linux/drivers/block/nbd.c
<<
>>
Prefs
   1/*
   2 * Network block device - make block devices work over TCP
   3 *
   4 * Note that you can not swap over this thing, yet. Seems to work but
   5 * deadlocks sometimes - you can not swap over TCP in general.
   6 * 
   7 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
   8 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
   9 *
  10 * This file is released under GPLv2 or later.
  11 *
  12 * (part of code stolen from loop.c)
  13 */
  14
  15#include <linux/major.h>
  16
  17#include <linux/blkdev.h>
  18#include <linux/module.h>
  19#include <linux/init.h>
  20#include <linux/sched.h>
  21#include <linux/sched/mm.h>
  22#include <linux/fs.h>
  23#include <linux/bio.h>
  24#include <linux/stat.h>
  25#include <linux/errno.h>
  26#include <linux/file.h>
  27#include <linux/ioctl.h>
  28#include <linux/mutex.h>
  29#include <linux/compiler.h>
  30#include <linux/err.h>
  31#include <linux/kernel.h>
  32#include <linux/slab.h>
  33#include <net/sock.h>
  34#include <linux/net.h>
  35#include <linux/kthread.h>
  36#include <linux/types.h>
  37#include <linux/debugfs.h>
  38#include <linux/blk-mq.h>
  39
  40#include <linux/uaccess.h>
  41#include <asm/types.h>
  42
  43#include <linux/nbd.h>
  44#include <linux/nbd-netlink.h>
  45#include <net/genetlink.h>
  46
  47static DEFINE_IDR(nbd_index_idr);
  48static DEFINE_MUTEX(nbd_index_mutex);
  49static int nbd_total_devices = 0;
  50
  51struct nbd_sock {
  52        struct socket *sock;
  53        struct mutex tx_lock;
  54        struct request *pending;
  55        int sent;
  56        bool dead;
  57        int fallback_index;
  58        int cookie;
  59};
  60
  61struct recv_thread_args {
  62        struct work_struct work;
  63        struct nbd_device *nbd;
  64        int index;
  65};
  66
  67struct link_dead_args {
  68        struct work_struct work;
  69        int index;
  70};
  71
  72#define NBD_TIMEDOUT                    0
  73#define NBD_DISCONNECT_REQUESTED        1
  74#define NBD_DISCONNECTED                2
  75#define NBD_HAS_PID_FILE                3
  76#define NBD_HAS_CONFIG_REF              4
  77#define NBD_BOUND                       5
  78#define NBD_DESTROY_ON_DISCONNECT       6
  79#define NBD_DISCONNECT_ON_CLOSE         7
  80
  81struct nbd_config {
  82        u32 flags;
  83        unsigned long runtime_flags;
  84        u64 dead_conn_timeout;
  85
  86        struct nbd_sock **socks;
  87        int num_connections;
  88        atomic_t live_connections;
  89        wait_queue_head_t conn_wait;
  90
  91        atomic_t recv_threads;
  92        wait_queue_head_t recv_wq;
  93        loff_t blksize;
  94        loff_t bytesize;
  95#if IS_ENABLED(CONFIG_DEBUG_FS)
  96        struct dentry *dbg_dir;
  97#endif
  98};
  99
 100struct nbd_device {
 101        struct blk_mq_tag_set tag_set;
 102
 103        int index;
 104        refcount_t config_refs;
 105        refcount_t refs;
 106        struct nbd_config *config;
 107        struct mutex config_lock;
 108        struct gendisk *disk;
 109
 110        struct list_head list;
 111        struct task_struct *task_recv;
 112        struct task_struct *task_setup;
 113};
 114
 115#define NBD_CMD_REQUEUED        1
 116
 117struct nbd_cmd {
 118        struct nbd_device *nbd;
 119        struct mutex lock;
 120        int index;
 121        int cookie;
 122        blk_status_t status;
 123        unsigned long flags;
 124        u32 cmd_cookie;
 125};
 126
 127#if IS_ENABLED(CONFIG_DEBUG_FS)
 128static struct dentry *nbd_dbg_dir;
 129#endif
 130
 131#define nbd_name(nbd) ((nbd)->disk->disk_name)
 132
 133#define NBD_MAGIC 0x68797548
 134
 135static unsigned int nbds_max = 16;
 136static int max_part = 16;
 137static struct workqueue_struct *recv_workqueue;
 138static int part_shift;
 139
 140static int nbd_dev_dbg_init(struct nbd_device *nbd);
 141static void nbd_dev_dbg_close(struct nbd_device *nbd);
 142static void nbd_config_put(struct nbd_device *nbd);
 143static void nbd_connect_reply(struct genl_info *info, int index);
 144static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
 145static void nbd_dead_link_work(struct work_struct *work);
 146static void nbd_disconnect_and_put(struct nbd_device *nbd);
 147
 148static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 149{
 150        return disk_to_dev(nbd->disk);
 151}
 152
 153static void nbd_requeue_cmd(struct nbd_cmd *cmd)
 154{
 155        struct request *req = blk_mq_rq_from_pdu(cmd);
 156
 157        if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
 158                blk_mq_requeue_request(req, true);
 159}
 160
 161#define NBD_COOKIE_BITS 32
 162
 163static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
 164{
 165        struct request *req = blk_mq_rq_from_pdu(cmd);
 166        u32 tag = blk_mq_unique_tag(req);
 167        u64 cookie = cmd->cmd_cookie;
 168
 169        return (cookie << NBD_COOKIE_BITS) | tag;
 170}
 171
 172static u32 nbd_handle_to_tag(u64 handle)
 173{
 174        return (u32)handle;
 175}
 176
 177static u32 nbd_handle_to_cookie(u64 handle)
 178{
 179        return (u32)(handle >> NBD_COOKIE_BITS);
 180}
 181
 182static const char *nbdcmd_to_ascii(int cmd)
 183{
 184        switch (cmd) {
 185        case  NBD_CMD_READ: return "read";
 186        case NBD_CMD_WRITE: return "write";
 187        case  NBD_CMD_DISC: return "disconnect";
 188        case NBD_CMD_FLUSH: return "flush";
 189        case  NBD_CMD_TRIM: return "trim/discard";
 190        }
 191        return "invalid";
 192}
 193
 194static ssize_t pid_show(struct device *dev,
 195                        struct device_attribute *attr, char *buf)
 196{
 197        struct gendisk *disk = dev_to_disk(dev);
 198        struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
 199
 200        return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
 201}
 202
 203static const struct device_attribute pid_attr = {
 204        .attr = { .name = "pid", .mode = 0444},
 205        .show = pid_show,
 206};
 207
 208static void nbd_dev_remove(struct nbd_device *nbd)
 209{
 210        struct gendisk *disk = nbd->disk;
 211        struct request_queue *q;
 212
 213        if (disk) {
 214                q = disk->queue;
 215                del_gendisk(disk);
 216                blk_cleanup_queue(q);
 217                blk_mq_free_tag_set(&nbd->tag_set);
 218                disk->private_data = NULL;
 219                put_disk(disk);
 220        }
 221        kfree(nbd);
 222}
 223
 224static void nbd_put(struct nbd_device *nbd)
 225{
 226        if (refcount_dec_and_mutex_lock(&nbd->refs,
 227                                        &nbd_index_mutex)) {
 228                idr_remove(&nbd_index_idr, nbd->index);
 229                mutex_unlock(&nbd_index_mutex);
 230                nbd_dev_remove(nbd);
 231        }
 232}
 233
 234static int nbd_disconnected(struct nbd_config *config)
 235{
 236        return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
 237                test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
 238}
 239
 240static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
 241                                int notify)
 242{
 243        if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
 244                struct link_dead_args *args;
 245                args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
 246                if (args) {
 247                        INIT_WORK(&args->work, nbd_dead_link_work);
 248                        args->index = nbd->index;
 249                        queue_work(system_wq, &args->work);
 250                }
 251        }
 252        if (!nsock->dead) {
 253                kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
 254                if (atomic_dec_return(&nbd->config->live_connections) == 0) {
 255                        if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
 256                                               &nbd->config->runtime_flags)) {
 257                                set_bit(NBD_DISCONNECTED,
 258                                        &nbd->config->runtime_flags);
 259                                dev_info(nbd_to_dev(nbd),
 260                                        "Disconnected due to user request.\n");
 261                        }
 262                }
 263        }
 264        nsock->dead = true;
 265        nsock->pending = NULL;
 266        nsock->sent = 0;
 267}
 268
 269static void nbd_size_clear(struct nbd_device *nbd)
 270{
 271        if (nbd->config->bytesize) {
 272                set_capacity(nbd->disk, 0);
 273                kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 274        }
 275}
 276
 277static void nbd_size_update(struct nbd_device *nbd)
 278{
 279        struct nbd_config *config = nbd->config;
 280        struct block_device *bdev = bdget_disk(nbd->disk, 0);
 281
 282        if (config->flags & NBD_FLAG_SEND_TRIM) {
 283                nbd->disk->queue->limits.discard_granularity = config->blksize;
 284                nbd->disk->queue->limits.discard_alignment = config->blksize;
 285                blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
 286        }
 287        blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
 288        blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
 289        set_capacity(nbd->disk, config->bytesize >> 9);
 290        if (bdev) {
 291                if (bdev->bd_disk)
 292                        bd_set_size(bdev, config->bytesize);
 293                else
 294                        bdev->bd_invalidated = 1;
 295                bdput(bdev);
 296        }
 297        kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 298}
 299
 300static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
 301                         loff_t nr_blocks)
 302{
 303        struct nbd_config *config = nbd->config;
 304        config->blksize = blocksize;
 305        config->bytesize = blocksize * nr_blocks;
 306        if (nbd->task_recv != NULL)
 307                nbd_size_update(nbd);
 308}
 309
 310static void nbd_complete_rq(struct request *req)
 311{
 312        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 313
 314        dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
 315                cmd->status ? "failed" : "done");
 316
 317        blk_mq_end_request(req, cmd->status);
 318}
 319
 320/*
 321 * Forcibly shutdown the socket causing all listeners to error
 322 */
 323static void sock_shutdown(struct nbd_device *nbd)
 324{
 325        struct nbd_config *config = nbd->config;
 326        int i;
 327
 328        if (config->num_connections == 0)
 329                return;
 330        if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
 331                return;
 332
 333        for (i = 0; i < config->num_connections; i++) {
 334                struct nbd_sock *nsock = config->socks[i];
 335                mutex_lock(&nsock->tx_lock);
 336                nbd_mark_nsock_dead(nbd, nsock, 0);
 337                mutex_unlock(&nsock->tx_lock);
 338        }
 339        dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
 340}
 341
 342static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 343                                                 bool reserved)
 344{
 345        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 346        struct nbd_device *nbd = cmd->nbd;
 347        struct nbd_config *config;
 348
 349        if (!refcount_inc_not_zero(&nbd->config_refs)) {
 350                cmd->status = BLK_STS_TIMEOUT;
 351                goto done;
 352        }
 353        config = nbd->config;
 354
 355        if (!mutex_trylock(&cmd->lock))
 356                return BLK_EH_RESET_TIMER;
 357
 358        if (config->num_connections > 1) {
 359                dev_err_ratelimited(nbd_to_dev(nbd),
 360                                    "Connection timed out, retrying (%d/%d alive)\n",
 361                                    atomic_read(&config->live_connections),
 362                                    config->num_connections);
 363                /*
 364                 * Hooray we have more connections, requeue this IO, the submit
 365                 * path will put it on a real connection.
 366                 */
 367                if (config->socks && config->num_connections > 1) {
 368                        if (cmd->index < config->num_connections) {
 369                                struct nbd_sock *nsock =
 370                                        config->socks[cmd->index];
 371                                mutex_lock(&nsock->tx_lock);
 372                                /* We can have multiple outstanding requests, so
 373                                 * we don't want to mark the nsock dead if we've
 374                                 * already reconnected with a new socket, so
 375                                 * only mark it dead if its the same socket we
 376                                 * were sent out on.
 377                                 */
 378                                if (cmd->cookie == nsock->cookie)
 379                                        nbd_mark_nsock_dead(nbd, nsock, 1);
 380                                mutex_unlock(&nsock->tx_lock);
 381                        }
 382                        mutex_unlock(&cmd->lock);
 383                        nbd_requeue_cmd(cmd);
 384                        nbd_config_put(nbd);
 385                        return BLK_EH_DONE;
 386                }
 387        } else {
 388                dev_err_ratelimited(nbd_to_dev(nbd),
 389                                    "Connection timed out\n");
 390        }
 391        set_bit(NBD_TIMEDOUT, &config->runtime_flags);
 392        cmd->status = BLK_STS_IOERR;
 393        mutex_unlock(&cmd->lock);
 394        sock_shutdown(nbd);
 395        nbd_config_put(nbd);
 396done:
 397        blk_mq_complete_request(req);
 398        return BLK_EH_DONE;
 399}
 400
 401/*
 402 *  Send or receive packet.
 403 */
 404static int sock_xmit(struct nbd_device *nbd, int index, int send,
 405                     struct iov_iter *iter, int msg_flags, int *sent)
 406{
 407        struct nbd_config *config = nbd->config;
 408        struct socket *sock = config->socks[index]->sock;
 409        int result;
 410        struct msghdr msg;
 411        unsigned int noreclaim_flag;
 412
 413        if (unlikely(!sock)) {
 414                dev_err_ratelimited(disk_to_dev(nbd->disk),
 415                        "Attempted %s on closed socket in sock_xmit\n",
 416                        (send ? "send" : "recv"));
 417                return -EINVAL;
 418        }
 419
 420        msg.msg_iter = *iter;
 421
 422        noreclaim_flag = memalloc_noreclaim_save();
 423        do {
 424                sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
 425                msg.msg_name = NULL;
 426                msg.msg_namelen = 0;
 427                msg.msg_control = NULL;
 428                msg.msg_controllen = 0;
 429                msg.msg_flags = msg_flags | MSG_NOSIGNAL;
 430
 431                if (send)
 432                        result = sock_sendmsg(sock, &msg);
 433                else
 434                        result = sock_recvmsg(sock, &msg, msg.msg_flags);
 435
 436                if (result <= 0) {
 437                        if (result == 0)
 438                                result = -EPIPE; /* short read */
 439                        break;
 440                }
 441                if (sent)
 442                        *sent += result;
 443        } while (msg_data_left(&msg));
 444
 445        memalloc_noreclaim_restore(noreclaim_flag);
 446
 447        return result;
 448}
 449
 450/*
 451 * Different settings for sk->sk_sndtimeo can result in different return values
 452 * if there is a signal pending when we enter sendmsg, because reasons?
 453 */
 454static inline int was_interrupted(int result)
 455{
 456        return result == -ERESTARTSYS || result == -EINTR;
 457}
 458
 459/* always call with the tx_lock held */
 460static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 461{
 462        struct request *req = blk_mq_rq_from_pdu(cmd);
 463        struct nbd_config *config = nbd->config;
 464        struct nbd_sock *nsock = config->socks[index];
 465        int result;
 466        struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
 467        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
 468        struct iov_iter from;
 469        unsigned long size = blk_rq_bytes(req);
 470        struct bio *bio;
 471        u64 handle;
 472        u32 type;
 473        u32 nbd_cmd_flags = 0;
 474        int sent = nsock->sent, skip = 0;
 475
 476        iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
 477
 478        switch (req_op(req)) {
 479        case REQ_OP_DISCARD:
 480                type = NBD_CMD_TRIM;
 481                break;
 482        case REQ_OP_FLUSH:
 483                type = NBD_CMD_FLUSH;
 484                break;
 485        case REQ_OP_WRITE:
 486                type = NBD_CMD_WRITE;
 487                break;
 488        case REQ_OP_READ:
 489                type = NBD_CMD_READ;
 490                break;
 491        default:
 492                return -EIO;
 493        }
 494
 495        if (rq_data_dir(req) == WRITE &&
 496            (config->flags & NBD_FLAG_READ_ONLY)) {
 497                dev_err_ratelimited(disk_to_dev(nbd->disk),
 498                                    "Write on read-only\n");
 499                return -EIO;
 500        }
 501
 502        if (req->cmd_flags & REQ_FUA)
 503                nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
 504
 505        /* We did a partial send previously, and we at least sent the whole
 506         * request struct, so just go and send the rest of the pages in the
 507         * request.
 508         */
 509        if (sent) {
 510                if (sent >= sizeof(request)) {
 511                        skip = sent - sizeof(request);
 512                        goto send_pages;
 513                }
 514                iov_iter_advance(&from, sent);
 515        } else {
 516                cmd->cmd_cookie++;
 517        }
 518        cmd->index = index;
 519        cmd->cookie = nsock->cookie;
 520        request.type = htonl(type | nbd_cmd_flags);
 521        if (type != NBD_CMD_FLUSH) {
 522                request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 523                request.len = htonl(size);
 524        }
 525        handle = nbd_cmd_handle(cmd);
 526        memcpy(request.handle, &handle, sizeof(handle));
 527
 528        dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
 529                req, nbdcmd_to_ascii(type),
 530                (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
 531        result = sock_xmit(nbd, index, 1, &from,
 532                        (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
 533        if (result <= 0) {
 534                if (was_interrupted(result)) {
 535                        /* If we havne't sent anything we can just return BUSY,
 536                         * however if we have sent something we need to make
 537                         * sure we only allow this req to be sent until we are
 538                         * completely done.
 539                         */
 540                        if (sent) {
 541                                nsock->pending = req;
 542                                nsock->sent = sent;
 543                        }
 544                        set_bit(NBD_CMD_REQUEUED, &cmd->flags);
 545                        return BLK_STS_RESOURCE;
 546                }
 547                dev_err_ratelimited(disk_to_dev(nbd->disk),
 548                        "Send control failed (result %d)\n", result);
 549                return -EAGAIN;
 550        }
 551send_pages:
 552        if (type != NBD_CMD_WRITE)
 553                goto out;
 554
 555        bio = req->bio;
 556        while (bio) {
 557                struct bio *next = bio->bi_next;
 558                struct bvec_iter iter;
 559                struct bio_vec bvec;
 560
 561                bio_for_each_segment(bvec, bio, iter) {
 562                        bool is_last = !next && bio_iter_last(bvec, iter);
 563                        int flags = is_last ? 0 : MSG_MORE;
 564
 565                        dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
 566                                req, bvec.bv_len);
 567                        iov_iter_bvec(&from, ITER_BVEC | WRITE,
 568                                      &bvec, 1, bvec.bv_len);
 569                        if (skip) {
 570                                if (skip >= iov_iter_count(&from)) {
 571                                        skip -= iov_iter_count(&from);
 572                                        continue;
 573                                }
 574                                iov_iter_advance(&from, skip);
 575                                skip = 0;
 576                        }
 577                        result = sock_xmit(nbd, index, 1, &from, flags, &sent);
 578                        if (result <= 0) {
 579                                if (was_interrupted(result)) {
 580                                        /* We've already sent the header, we
 581                                         * have no choice but to set pending and
 582                                         * return BUSY.
 583                                         */
 584                                        nsock->pending = req;
 585                                        nsock->sent = sent;
 586                                        set_bit(NBD_CMD_REQUEUED, &cmd->flags);
 587                                        return BLK_STS_RESOURCE;
 588                                }
 589                                dev_err(disk_to_dev(nbd->disk),
 590                                        "Send data failed (result %d)\n",
 591                                        result);
 592                                return -EAGAIN;
 593                        }
 594                        /*
 595                         * The completion might already have come in,
 596                         * so break for the last one instead of letting
 597                         * the iterator do it. This prevents use-after-free
 598                         * of the bio.
 599                         */
 600                        if (is_last)
 601                                break;
 602                }
 603                bio = next;
 604        }
 605out:
 606        nsock->pending = NULL;
 607        nsock->sent = 0;
 608        return 0;
 609}
 610
 611/* NULL returned = something went wrong, inform userspace */
 612static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 613{
 614        struct nbd_config *config = nbd->config;
 615        int result;
 616        struct nbd_reply reply;
 617        struct nbd_cmd *cmd;
 618        struct request *req = NULL;
 619        u64 handle;
 620        u16 hwq;
 621        u32 tag;
 622        struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
 623        struct iov_iter to;
 624        int ret = 0;
 625
 626        reply.magic = 0;
 627        iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
 628        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 629        if (result <= 0) {
 630                if (!nbd_disconnected(config))
 631                        dev_err(disk_to_dev(nbd->disk),
 632                                "Receive control failed (result %d)\n", result);
 633                return ERR_PTR(result);
 634        }
 635
 636        if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
 637                dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
 638                                (unsigned long)ntohl(reply.magic));
 639                return ERR_PTR(-EPROTO);
 640        }
 641
 642        memcpy(&handle, reply.handle, sizeof(handle));
 643        tag = nbd_handle_to_tag(handle);
 644        hwq = blk_mq_unique_tag_to_hwq(tag);
 645        if (hwq < nbd->tag_set.nr_hw_queues)
 646                req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
 647                                       blk_mq_unique_tag_to_tag(tag));
 648        if (!req || !blk_mq_request_started(req)) {
 649                dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
 650                        tag, req);
 651                return ERR_PTR(-ENOENT);
 652        }
 653        cmd = blk_mq_rq_to_pdu(req);
 654
 655        mutex_lock(&cmd->lock);
 656        if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
 657                dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
 658                        req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
 659                ret = -ENOENT;
 660                goto out;
 661        }
 662        if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
 663                dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
 664                        req);
 665                ret = -ENOENT;
 666                goto out;
 667        }
 668        if (ntohl(reply.error)) {
 669                dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
 670                        ntohl(reply.error));
 671                cmd->status = BLK_STS_IOERR;
 672                goto out;
 673        }
 674
 675        dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
 676        if (rq_data_dir(req) != WRITE) {
 677                struct req_iterator iter;
 678                struct bio_vec bvec;
 679
 680                rq_for_each_segment(bvec, req, iter) {
 681                        iov_iter_bvec(&to, ITER_BVEC | READ,
 682                                      &bvec, 1, bvec.bv_len);
 683                        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 684                        if (result <= 0) {
 685                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 686                                        result);
 687                                /*
 688                                 * If we've disconnected or we only have 1
 689                                 * connection then we need to make sure we
 690                                 * complete this request, otherwise error out
 691                                 * and let the timeout stuff handle resubmitting
 692                                 * this request onto another connection.
 693                                 */
 694                                if (nbd_disconnected(config) ||
 695                                    config->num_connections <= 1) {
 696                                        cmd->status = BLK_STS_IOERR;
 697                                        goto out;
 698                                }
 699                                ret = -EIO;
 700                                goto out;
 701                        }
 702                        dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
 703                                req, bvec.bv_len);
 704                }
 705        }
 706out:
 707        mutex_unlock(&cmd->lock);
 708        return ret ? ERR_PTR(ret) : cmd;
 709}
 710
 711static void recv_work(struct work_struct *work)
 712{
 713        struct recv_thread_args *args = container_of(work,
 714                                                     struct recv_thread_args,
 715                                                     work);
 716        struct nbd_device *nbd = args->nbd;
 717        struct nbd_config *config = nbd->config;
 718        struct nbd_cmd *cmd;
 719
 720        while (1) {
 721                cmd = nbd_read_stat(nbd, args->index);
 722                if (IS_ERR(cmd)) {
 723                        struct nbd_sock *nsock = config->socks[args->index];
 724
 725                        mutex_lock(&nsock->tx_lock);
 726                        nbd_mark_nsock_dead(nbd, nsock, 1);
 727                        mutex_unlock(&nsock->tx_lock);
 728                        break;
 729                }
 730
 731                blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
 732        }
 733        atomic_dec(&config->recv_threads);
 734        wake_up(&config->recv_wq);
 735        nbd_config_put(nbd);
 736        kfree(args);
 737}
 738
 739static void nbd_clear_req(struct request *req, void *data, bool reserved)
 740{
 741        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 742
 743        cmd->status = BLK_STS_IOERR;
 744        blk_mq_complete_request(req);
 745}
 746
 747static void nbd_clear_que(struct nbd_device *nbd)
 748{
 749        blk_mq_quiesce_queue(nbd->disk->queue);
 750        blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 751        blk_mq_unquiesce_queue(nbd->disk->queue);
 752        dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 753}
 754
 755static int find_fallback(struct nbd_device *nbd, int index)
 756{
 757        struct nbd_config *config = nbd->config;
 758        int new_index = -1;
 759        struct nbd_sock *nsock = config->socks[index];
 760        int fallback = nsock->fallback_index;
 761
 762        if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
 763                return new_index;
 764
 765        if (config->num_connections <= 1) {
 766                dev_err_ratelimited(disk_to_dev(nbd->disk),
 767                                    "Attempted send on invalid socket\n");
 768                return new_index;
 769        }
 770
 771        if (fallback >= 0 && fallback < config->num_connections &&
 772            !config->socks[fallback]->dead)
 773                return fallback;
 774
 775        if (nsock->fallback_index < 0 ||
 776            nsock->fallback_index >= config->num_connections ||
 777            config->socks[nsock->fallback_index]->dead) {
 778                int i;
 779                for (i = 0; i < config->num_connections; i++) {
 780                        if (i == index)
 781                                continue;
 782                        if (!config->socks[i]->dead) {
 783                                new_index = i;
 784                                break;
 785                        }
 786                }
 787                nsock->fallback_index = new_index;
 788                if (new_index < 0) {
 789                        dev_err_ratelimited(disk_to_dev(nbd->disk),
 790                                            "Dead connection, failed to find a fallback\n");
 791                        return new_index;
 792                }
 793        }
 794        new_index = nsock->fallback_index;
 795        return new_index;
 796}
 797
 798static int wait_for_reconnect(struct nbd_device *nbd)
 799{
 800        struct nbd_config *config = nbd->config;
 801        if (!config->dead_conn_timeout)
 802                return 0;
 803        if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
 804                return 0;
 805        return wait_event_timeout(config->conn_wait,
 806                                  atomic_read(&config->live_connections) > 0,
 807                                  config->dead_conn_timeout) > 0;
 808}
 809
 810static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 811{
 812        struct request *req = blk_mq_rq_from_pdu(cmd);
 813        struct nbd_device *nbd = cmd->nbd;
 814        struct nbd_config *config;
 815        struct nbd_sock *nsock;
 816        int ret;
 817
 818        if (!refcount_inc_not_zero(&nbd->config_refs)) {
 819                dev_err_ratelimited(disk_to_dev(nbd->disk),
 820                                    "Socks array is empty\n");
 821                blk_mq_start_request(req);
 822                return -EINVAL;
 823        }
 824        config = nbd->config;
 825
 826        if (index >= config->num_connections) {
 827                dev_err_ratelimited(disk_to_dev(nbd->disk),
 828                                    "Attempted send on invalid socket\n");
 829                nbd_config_put(nbd);
 830                blk_mq_start_request(req);
 831                return -EINVAL;
 832        }
 833        cmd->status = BLK_STS_OK;
 834again:
 835        nsock = config->socks[index];
 836        mutex_lock(&nsock->tx_lock);
 837        if (nsock->dead) {
 838                int old_index = index;
 839                index = find_fallback(nbd, index);
 840                mutex_unlock(&nsock->tx_lock);
 841                if (index < 0) {
 842                        if (wait_for_reconnect(nbd)) {
 843                                index = old_index;
 844                                goto again;
 845                        }
 846                        /* All the sockets should already be down at this point,
 847                         * we just want to make sure that DISCONNECTED is set so
 848                         * any requests that come in that were queue'ed waiting
 849                         * for the reconnect timer don't trigger the timer again
 850                         * and instead just error out.
 851                         */
 852                        sock_shutdown(nbd);
 853                        nbd_config_put(nbd);
 854                        blk_mq_start_request(req);
 855                        return -EIO;
 856                }
 857                goto again;
 858        }
 859
 860        /* Handle the case that we have a pending request that was partially
 861         * transmitted that _has_ to be serviced first.  We need to call requeue
 862         * here so that it gets put _after_ the request that is already on the
 863         * dispatch list.
 864         */
 865        blk_mq_start_request(req);
 866        if (unlikely(nsock->pending && nsock->pending != req)) {
 867                nbd_requeue_cmd(cmd);
 868                ret = 0;
 869                goto out;
 870        }
 871        /*
 872         * Some failures are related to the link going down, so anything that
 873         * returns EAGAIN can be retried on a different socket.
 874         */
 875        ret = nbd_send_cmd(nbd, cmd, index);
 876        if (ret == -EAGAIN) {
 877                dev_err_ratelimited(disk_to_dev(nbd->disk),
 878                                    "Request send failed, requeueing\n");
 879                nbd_mark_nsock_dead(nbd, nsock, 1);
 880                nbd_requeue_cmd(cmd);
 881                ret = 0;
 882        }
 883out:
 884        mutex_unlock(&nsock->tx_lock);
 885        nbd_config_put(nbd);
 886        return ret;
 887}
 888
 889static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 890                        const struct blk_mq_queue_data *bd)
 891{
 892        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 893        int ret;
 894
 895        /*
 896         * Since we look at the bio's to send the request over the network we
 897         * need to make sure the completion work doesn't mark this request done
 898         * before we are done doing our send.  This keeps us from dereferencing
 899         * freed data if we have particularly fast completions (ie we get the
 900         * completion before we exit sock_xmit on the last bvec) or in the case
 901         * that the server is misbehaving (or there was an error) before we're
 902         * done sending everything over the wire.
 903         */
 904        mutex_lock(&cmd->lock);
 905        clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
 906
 907        /* We can be called directly from the user space process, which means we
 908         * could possibly have signals pending so our sendmsg will fail.  In
 909         * this case we need to return that we are busy, otherwise error out as
 910         * appropriate.
 911         */
 912        ret = nbd_handle_cmd(cmd, hctx->queue_num);
 913        if (ret < 0)
 914                ret = BLK_STS_IOERR;
 915        else if (!ret)
 916                ret = BLK_STS_OK;
 917        mutex_unlock(&cmd->lock);
 918
 919        return ret;
 920}
 921
 922static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 923                          bool netlink)
 924{
 925        struct nbd_config *config = nbd->config;
 926        struct socket *sock;
 927        struct nbd_sock **socks;
 928        struct nbd_sock *nsock;
 929        int err;
 930
 931        sock = sockfd_lookup(arg, &err);
 932        if (!sock)
 933                return err;
 934
 935        if (!netlink && !nbd->task_setup &&
 936            !test_bit(NBD_BOUND, &config->runtime_flags))
 937                nbd->task_setup = current;
 938
 939        if (!netlink &&
 940            (nbd->task_setup != current ||
 941             test_bit(NBD_BOUND, &config->runtime_flags))) {
 942                dev_err(disk_to_dev(nbd->disk),
 943                        "Device being setup by another task");
 944                sockfd_put(sock);
 945                return -EBUSY;
 946        }
 947
 948        socks = krealloc(config->socks, (config->num_connections + 1) *
 949                         sizeof(struct nbd_sock *), GFP_KERNEL);
 950        if (!socks) {
 951                sockfd_put(sock);
 952                return -ENOMEM;
 953        }
 954        nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
 955        if (!nsock) {
 956                sockfd_put(sock);
 957                return -ENOMEM;
 958        }
 959
 960        config->socks = socks;
 961
 962        nsock->fallback_index = -1;
 963        nsock->dead = false;
 964        mutex_init(&nsock->tx_lock);
 965        nsock->sock = sock;
 966        nsock->pending = NULL;
 967        nsock->sent = 0;
 968        nsock->cookie = 0;
 969        socks[config->num_connections++] = nsock;
 970        atomic_inc(&config->live_connections);
 971
 972        return 0;
 973}
 974
 975static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
 976{
 977        struct nbd_config *config = nbd->config;
 978        struct socket *sock, *old;
 979        struct recv_thread_args *args;
 980        int i;
 981        int err;
 982
 983        sock = sockfd_lookup(arg, &err);
 984        if (!sock)
 985                return err;
 986
 987        args = kzalloc(sizeof(*args), GFP_KERNEL);
 988        if (!args) {
 989                sockfd_put(sock);
 990                return -ENOMEM;
 991        }
 992
 993        for (i = 0; i < config->num_connections; i++) {
 994                struct nbd_sock *nsock = config->socks[i];
 995
 996                if (!nsock->dead)
 997                        continue;
 998
 999                mutex_lock(&nsock->tx_lock);
1000                if (!nsock->dead) {
1001                        mutex_unlock(&nsock->tx_lock);
1002                        continue;
1003                }
1004                sk_set_memalloc(sock->sk);
1005                if (nbd->tag_set.timeout)
1006                        sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
1007                atomic_inc(&config->recv_threads);
1008                refcount_inc(&nbd->config_refs);
1009                old = nsock->sock;
1010                nsock->fallback_index = -1;
1011                nsock->sock = sock;
1012                nsock->dead = false;
1013                INIT_WORK(&args->work, recv_work);
1014                args->index = i;
1015                args->nbd = nbd;
1016                nsock->cookie++;
1017                mutex_unlock(&nsock->tx_lock);
1018                sockfd_put(old);
1019
1020                clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
1021
1022                /* We take the tx_mutex in an error path in the recv_work, so we
1023                 * need to queue_work outside of the tx_mutex.
1024                 */
1025                queue_work(recv_workqueue, &args->work);
1026
1027                atomic_inc(&config->live_connections);
1028                wake_up(&config->conn_wait);
1029                return 0;
1030        }
1031        sockfd_put(sock);
1032        kfree(args);
1033        return -ENOSPC;
1034}
1035
1036static void nbd_bdev_reset(struct block_device *bdev)
1037{
1038        if (bdev->bd_openers > 1)
1039                return;
1040        bd_set_size(bdev, 0);
1041}
1042
1043static void nbd_parse_flags(struct nbd_device *nbd)
1044{
1045        struct nbd_config *config = nbd->config;
1046        if (config->flags & NBD_FLAG_READ_ONLY)
1047                set_disk_ro(nbd->disk, true);
1048        else
1049                set_disk_ro(nbd->disk, false);
1050        if (config->flags & NBD_FLAG_SEND_TRIM)
1051                blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1052        if (config->flags & NBD_FLAG_SEND_FLUSH) {
1053                if (config->flags & NBD_FLAG_SEND_FUA)
1054                        blk_queue_write_cache(nbd->disk->queue, true, true);
1055                else
1056                        blk_queue_write_cache(nbd->disk->queue, true, false);
1057        }
1058        else
1059                blk_queue_write_cache(nbd->disk->queue, false, false);
1060}
1061
1062static void send_disconnects(struct nbd_device *nbd)
1063{
1064        struct nbd_config *config = nbd->config;
1065        struct nbd_request request = {
1066                .magic = htonl(NBD_REQUEST_MAGIC),
1067                .type = htonl(NBD_CMD_DISC),
1068        };
1069        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
1070        struct iov_iter from;
1071        int i, ret;
1072
1073        for (i = 0; i < config->num_connections; i++) {
1074                struct nbd_sock *nsock = config->socks[i];
1075
1076                iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
1077                mutex_lock(&nsock->tx_lock);
1078                ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
1079                if (ret <= 0)
1080                        dev_err(disk_to_dev(nbd->disk),
1081                                "Send disconnect failed %d\n", ret);
1082                mutex_unlock(&nsock->tx_lock);
1083        }
1084}
1085
1086static int nbd_disconnect(struct nbd_device *nbd)
1087{
1088        struct nbd_config *config = nbd->config;
1089
1090        dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
1091        set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
1092        send_disconnects(nbd);
1093        return 0;
1094}
1095
1096static void nbd_clear_sock(struct nbd_device *nbd)
1097{
1098        sock_shutdown(nbd);
1099        nbd_clear_que(nbd);
1100        nbd->task_setup = NULL;
1101}
1102
1103static void nbd_config_put(struct nbd_device *nbd)
1104{
1105        if (refcount_dec_and_mutex_lock(&nbd->config_refs,
1106                                        &nbd->config_lock)) {
1107                struct nbd_config *config = nbd->config;
1108                nbd_dev_dbg_close(nbd);
1109                nbd_size_clear(nbd);
1110                if (test_and_clear_bit(NBD_HAS_PID_FILE,
1111                                       &config->runtime_flags))
1112                        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
1113                nbd->task_recv = NULL;
1114                nbd_clear_sock(nbd);
1115                if (config->num_connections) {
1116                        int i;
1117                        for (i = 0; i < config->num_connections; i++) {
1118                                sockfd_put(config->socks[i]->sock);
1119                                kfree(config->socks[i]);
1120                        }
1121                        kfree(config->socks);
1122                }
1123                kfree(nbd->config);
1124                nbd->config = NULL;
1125
1126                nbd->tag_set.timeout = 0;
1127                nbd->disk->queue->limits.discard_granularity = 0;
1128                nbd->disk->queue->limits.discard_alignment = 0;
1129                blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
1130                blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1131
1132                mutex_unlock(&nbd->config_lock);
1133                nbd_put(nbd);
1134                module_put(THIS_MODULE);
1135        }
1136}
1137
1138static int nbd_start_device(struct nbd_device *nbd)
1139{
1140        struct nbd_config *config = nbd->config;
1141        int num_connections = config->num_connections;
1142        int error = 0, i;
1143
1144        if (nbd->task_recv)
1145                return -EBUSY;
1146        if (!config->socks)
1147                return -EINVAL;
1148        if (num_connections > 1 &&
1149            !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
1150                dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
1151                return -EINVAL;
1152        }
1153
1154        blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
1155        nbd->task_recv = current;
1156
1157        nbd_parse_flags(nbd);
1158
1159        error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
1160        if (error) {
1161                dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
1162                return error;
1163        }
1164        set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
1165
1166        nbd_dev_dbg_init(nbd);
1167        for (i = 0; i < num_connections; i++) {
1168                struct recv_thread_args *args;
1169
1170                args = kzalloc(sizeof(*args), GFP_KERNEL);
1171                if (!args) {
1172                        sock_shutdown(nbd);
1173                        return -ENOMEM;
1174                }
1175                sk_set_memalloc(config->socks[i]->sock->sk);
1176                if (nbd->tag_set.timeout)
1177                        config->socks[i]->sock->sk->sk_sndtimeo =
1178                                nbd->tag_set.timeout;
1179                atomic_inc(&config->recv_threads);
1180                refcount_inc(&nbd->config_refs);
1181                INIT_WORK(&args->work, recv_work);
1182                args->nbd = nbd;
1183                args->index = i;
1184                queue_work(recv_workqueue, &args->work);
1185        }
1186        nbd_size_update(nbd);
1187        return error;
1188}
1189
1190static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
1191{
1192        struct nbd_config *config = nbd->config;
1193        int ret;
1194
1195        ret = nbd_start_device(nbd);
1196        if (ret)
1197                return ret;
1198
1199        if (max_part)
1200                bdev->bd_invalidated = 1;
1201        mutex_unlock(&nbd->config_lock);
1202        ret = wait_event_interruptible(config->recv_wq,
1203                                         atomic_read(&config->recv_threads) == 0);
1204        if (ret)
1205                sock_shutdown(nbd);
1206        mutex_lock(&nbd->config_lock);
1207        nbd_bdev_reset(bdev);
1208        /* user requested, ignore socket errors */
1209        if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
1210                ret = 0;
1211        if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
1212                ret = -ETIMEDOUT;
1213        return ret;
1214}
1215
1216static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
1217                                 struct block_device *bdev)
1218{
1219        sock_shutdown(nbd);
1220        kill_bdev(bdev);
1221        nbd_bdev_reset(bdev);
1222        if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1223                               &nbd->config->runtime_flags))
1224                nbd_config_put(nbd);
1225}
1226
1227/* Must be called with config_lock held */
1228static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1229                       unsigned int cmd, unsigned long arg)
1230{
1231        struct nbd_config *config = nbd->config;
1232
1233        switch (cmd) {
1234        case NBD_DISCONNECT:
1235                return nbd_disconnect(nbd);
1236        case NBD_CLEAR_SOCK:
1237                nbd_clear_sock_ioctl(nbd, bdev);
1238                return 0;
1239        case NBD_SET_SOCK:
1240                return nbd_add_socket(nbd, arg, false);
1241        case NBD_SET_BLKSIZE:
1242                if (!arg || !is_power_of_2(arg) || arg < 512 ||
1243                    arg > PAGE_SIZE)
1244                        return -EINVAL;
1245                nbd_size_set(nbd, arg,
1246                             div_s64(config->bytesize, arg));
1247                return 0;
1248        case NBD_SET_SIZE:
1249                nbd_size_set(nbd, config->blksize,
1250                             div_s64(arg, config->blksize));
1251                return 0;
1252        case NBD_SET_SIZE_BLOCKS:
1253                nbd_size_set(nbd, config->blksize, arg);
1254                return 0;
1255        case NBD_SET_TIMEOUT:
1256                if (arg) {
1257                        nbd->tag_set.timeout = arg * HZ;
1258                        blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
1259                }
1260                return 0;
1261
1262        case NBD_SET_FLAGS:
1263                config->flags = arg;
1264                return 0;
1265        case NBD_DO_IT:
1266                return nbd_start_device_ioctl(nbd, bdev);
1267        case NBD_CLEAR_QUE:
1268                /*
1269                 * This is for compatibility only.  The queue is always cleared
1270                 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1271                 */
1272                return 0;
1273        case NBD_PRINT_DEBUG:
1274                /*
1275                 * For compatibility only, we no longer keep a list of
1276                 * outstanding requests.
1277                 */
1278                return 0;
1279        }
1280        return -ENOTTY;
1281}
1282
1283static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
1284                     unsigned int cmd, unsigned long arg)
1285{
1286        struct nbd_device *nbd = bdev->bd_disk->private_data;
1287        struct nbd_config *config = nbd->config;
1288        int error = -EINVAL;
1289
1290        if (!capable(CAP_SYS_ADMIN))
1291                return -EPERM;
1292
1293        /* The block layer will pass back some non-nbd ioctls in case we have
1294         * special handling for them, but we don't so just return an error.
1295         */
1296        if (_IOC_TYPE(cmd) != 0xab)
1297                return -EINVAL;
1298
1299        mutex_lock(&nbd->config_lock);
1300
1301        /* Don't allow ioctl operations on a nbd device that was created with
1302         * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
1303         */
1304        if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
1305            (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
1306                error = __nbd_ioctl(bdev, nbd, cmd, arg);
1307        else
1308                dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
1309        mutex_unlock(&nbd->config_lock);
1310        return error;
1311}
1312
1313static struct nbd_config *nbd_alloc_config(void)
1314{
1315        struct nbd_config *config;
1316
1317        config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1318        if (!config)
1319                return NULL;
1320        atomic_set(&config->recv_threads, 0);
1321        init_waitqueue_head(&config->recv_wq);
1322        init_waitqueue_head(&config->conn_wait);
1323        config->blksize = 1024;
1324        atomic_set(&config->live_connections, 0);
1325        try_module_get(THIS_MODULE);
1326        return config;
1327}
1328
1329static int nbd_open(struct block_device *bdev, fmode_t mode)
1330{
1331        struct nbd_device *nbd;
1332        int ret = 0;
1333
1334        mutex_lock(&nbd_index_mutex);
1335        nbd = bdev->bd_disk->private_data;
1336        if (!nbd) {
1337                ret = -ENXIO;
1338                goto out;
1339        }
1340        if (!refcount_inc_not_zero(&nbd->refs)) {
1341                ret = -ENXIO;
1342                goto out;
1343        }
1344        if (!refcount_inc_not_zero(&nbd->config_refs)) {
1345                struct nbd_config *config;
1346
1347                mutex_lock(&nbd->config_lock);
1348                if (refcount_inc_not_zero(&nbd->config_refs)) {
1349                        mutex_unlock(&nbd->config_lock);
1350                        goto out;
1351                }
1352                config = nbd->config = nbd_alloc_config();
1353                if (!config) {
1354                        ret = -ENOMEM;
1355                        mutex_unlock(&nbd->config_lock);
1356                        goto out;
1357                }
1358                refcount_set(&nbd->config_refs, 1);
1359                refcount_inc(&nbd->refs);
1360                mutex_unlock(&nbd->config_lock);
1361                bdev->bd_invalidated = 1;
1362        } else if (nbd_disconnected(nbd->config)) {
1363                bdev->bd_invalidated = 1;
1364        }
1365out:
1366        mutex_unlock(&nbd_index_mutex);
1367        return ret;
1368}
1369
1370static void nbd_release(struct gendisk *disk, fmode_t mode)
1371{
1372        struct nbd_device *nbd = disk->private_data;
1373        struct block_device *bdev = bdget_disk(disk, 0);
1374
1375        if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
1376                        bdev->bd_openers == 0)
1377                nbd_disconnect_and_put(nbd);
1378
1379        nbd_config_put(nbd);
1380        nbd_put(nbd);
1381}
1382
1383static const struct block_device_operations nbd_fops =
1384{
1385        .owner =        THIS_MODULE,
1386        .open =         nbd_open,
1387        .release =      nbd_release,
1388        .ioctl =        nbd_ioctl,
1389        .compat_ioctl = nbd_ioctl,
1390};
1391
1392#if IS_ENABLED(CONFIG_DEBUG_FS)
1393
1394static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1395{
1396        struct nbd_device *nbd = s->private;
1397
1398        if (nbd->task_recv)
1399                seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
1400
1401        return 0;
1402}
1403
1404static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
1405{
1406        return single_open(file, nbd_dbg_tasks_show, inode->i_private);
1407}
1408
1409static const struct file_operations nbd_dbg_tasks_ops = {
1410        .open = nbd_dbg_tasks_open,
1411        .read = seq_read,
1412        .llseek = seq_lseek,
1413        .release = single_release,
1414};
1415
1416static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1417{
1418        struct nbd_device *nbd = s->private;
1419        u32 flags = nbd->config->flags;
1420
1421        seq_printf(s, "Hex: 0x%08x\n\n", flags);
1422
1423        seq_puts(s, "Known flags:\n");
1424
1425        if (flags & NBD_FLAG_HAS_FLAGS)
1426                seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1427        if (flags & NBD_FLAG_READ_ONLY)
1428                seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1429        if (flags & NBD_FLAG_SEND_FLUSH)
1430                seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1431        if (flags & NBD_FLAG_SEND_FUA)
1432                seq_puts(s, "NBD_FLAG_SEND_FUA\n");
1433        if (flags & NBD_FLAG_SEND_TRIM)
1434                seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1435
1436        return 0;
1437}
1438
1439static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
1440{
1441        return single_open(file, nbd_dbg_flags_show, inode->i_private);
1442}
1443
1444static const struct file_operations nbd_dbg_flags_ops = {
1445        .open = nbd_dbg_flags_open,
1446        .read = seq_read,
1447        .llseek = seq_lseek,
1448        .release = single_release,
1449};
1450
1451static int nbd_dev_dbg_init(struct nbd_device *nbd)
1452{
1453        struct dentry *dir;
1454        struct nbd_config *config = nbd->config;
1455
1456        if (!nbd_dbg_dir)
1457                return -EIO;
1458
1459        dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1460        if (!dir) {
1461                dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1462                        nbd_name(nbd));
1463                return -EIO;
1464        }
1465        config->dbg_dir = dir;
1466
1467        debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
1468        debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1469        debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1470        debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
1471        debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
1472
1473        return 0;
1474}
1475
1476static void nbd_dev_dbg_close(struct nbd_device *nbd)
1477{
1478        debugfs_remove_recursive(nbd->config->dbg_dir);
1479}
1480
1481static int nbd_dbg_init(void)
1482{
1483        struct dentry *dbg_dir;
1484
1485        dbg_dir = debugfs_create_dir("nbd", NULL);
1486        if (!dbg_dir)
1487                return -EIO;
1488
1489        nbd_dbg_dir = dbg_dir;
1490
1491        return 0;
1492}
1493
1494static void nbd_dbg_close(void)
1495{
1496        debugfs_remove_recursive(nbd_dbg_dir);
1497}
1498
1499#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
1500
1501static int nbd_dev_dbg_init(struct nbd_device *nbd)
1502{
1503        return 0;
1504}
1505
1506static void nbd_dev_dbg_close(struct nbd_device *nbd)
1507{
1508}
1509
1510static int nbd_dbg_init(void)
1511{
1512        return 0;
1513}
1514
1515static void nbd_dbg_close(void)
1516{
1517}
1518
1519#endif
1520
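/*
 * blk-mq ->init_request() callback: the struct nbd_cmd lives in the request
 * PDU (sized via tag_set.cmd_size in nbd_dev_add()), so all that is needed
 * here is the back-pointer to the device and the per-command lock.
 */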
1521static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1522                            unsigned int hctx_idx, unsigned int numa_node)
1523{
1524        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
1525        cmd->nbd = set->driver_data;
1526        cmd->flags = 0;
1527        mutex_init(&cmd->lock);
1528        return 0;
1529}
1530
1531static const struct blk_mq_ops nbd_mq_ops = {
1532        .queue_rq       = nbd_queue_rq,
1533        .complete       = nbd_complete_rq,
1534        .init_request   = nbd_init_request,
1535        .timeout        = nbd_xmit_timeout,
1536};
1537
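/*
 * Allocate and register one nbd device: reserve an idr slot (the requested
 * index, or the first free one when index < 0), set up a single-hw-queue
 * blk-mq tag set with a queue depth of 128, and add the gendisk as nbd<idx>.
 * Returns the index on success or a negative errno.
 */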
1538static int nbd_dev_add(int index)
1539{
1540        struct nbd_device *nbd;
1541        struct gendisk *disk;
1542        struct request_queue *q;
1543        int err = -ENOMEM;
1544
1545        nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1546        if (!nbd)
1547                goto out;
1548
1549        disk = alloc_disk(1 << part_shift);
1550        if (!disk)
1551                goto out_free_nbd;
1552
1553        if (index >= 0) {
1554                err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1555                                GFP_KERNEL);
1556                if (err == -ENOSPC)
1557                        err = -EEXIST;
1558        } else {
1559                err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1560                if (err >= 0)
1561                        index = err;
1562        }
1563        if (err < 0)
1564                goto out_free_disk;
1565
1566        nbd->index = index;
1567        nbd->disk = disk;
1568        nbd->tag_set.ops = &nbd_mq_ops;
1569        nbd->tag_set.nr_hw_queues = 1;
1570        nbd->tag_set.queue_depth = 128;
1571        nbd->tag_set.numa_node = NUMA_NO_NODE;
1572        nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1573        nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1574                BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
1575        nbd->tag_set.driver_data = nbd;
1576
1577        err = blk_mq_alloc_tag_set(&nbd->tag_set);
1578        if (err)
1579                goto out_free_idr;
1580
1581        q = blk_mq_init_queue(&nbd->tag_set);
1582        if (IS_ERR(q)) {
1583                err = PTR_ERR(q);
1584                goto out_free_tags;
1585        }
1586        disk->queue = q;
1587
1588        /*
1589         * Tell the block layer that we are not a rotational device
1590         */
1591        blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
1592        blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1593        disk->queue->limits.discard_granularity = 0;
1594        disk->queue->limits.discard_alignment = 0;
1595        blk_queue_max_discard_sectors(disk->queue, 0);
1596        blk_queue_max_segment_size(disk->queue, UINT_MAX);
1597        blk_queue_max_segments(disk->queue, USHRT_MAX);
1598        blk_queue_max_hw_sectors(disk->queue, 65536);
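        /*
         * The hardware limit above is 65536 sectors; start with a more
         * conservative 256-sector (128 KiB) default transfer size, which
         * can be raised via queue/max_sectors_kb up to that limit.
         */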
1599        disk->queue->limits.max_sectors = 256;
1600
1601        mutex_init(&nbd->config_lock);
1602        refcount_set(&nbd->config_refs, 0);
1603        refcount_set(&nbd->refs, 1);
1604        INIT_LIST_HEAD(&nbd->list);
1605        disk->major = NBD_MAJOR;
1606        disk->first_minor = index << part_shift;
1607        disk->fops = &nbd_fops;
1608        disk->private_data = nbd;
1609        sprintf(disk->disk_name, "nbd%d", index);
1610        add_disk(disk);
1611        nbd_total_devices++;
1612        return index;
1613
1614out_free_tags:
1615        blk_mq_free_tag_set(&nbd->tag_set);
1616out_free_idr:
1617        idr_remove(&nbd_index_idr, index);
1618out_free_disk:
1619        put_disk(disk);
1620out_free_nbd:
1621        kfree(nbd);
1622out:
1623        return err;
1624}
1625
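/*
 * idr_for_each() callback used by nbd_genl_connect(): stop at (and report)
 * the first device that currently has no configuration attached, i.e. one
 * that is free to be reused for a new connection.
 */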
1626static int find_free_cb(int id, void *ptr, void *data)
1627{
1628        struct nbd_device *nbd = ptr;
1629        struct nbd_device **found = data;
1630
1631        if (!refcount_read(&nbd->config_refs)) {
1632                *found = nbd;
1633                return 1;
1634        }
1635        return 0;
1636}
1637
1638/* Netlink interface. */
1639static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
1640        [NBD_ATTR_INDEX]                =       { .type = NLA_U32 },
1641        [NBD_ATTR_SIZE_BYTES]           =       { .type = NLA_U64 },
1642        [NBD_ATTR_BLOCK_SIZE_BYTES]     =       { .type = NLA_U64 },
1643        [NBD_ATTR_TIMEOUT]              =       { .type = NLA_U64 },
1644        [NBD_ATTR_SERVER_FLAGS]         =       { .type = NLA_U64 },
1645        [NBD_ATTR_CLIENT_FLAGS]         =       { .type = NLA_U64 },
1646        [NBD_ATTR_SOCKETS]              =       { .type = NLA_NESTED},
1647        [NBD_ATTR_DEAD_CONN_TIMEOUT]    =       { .type = NLA_U64 },
1648        [NBD_ATTR_DEVICE_LIST]          =       { .type = NLA_NESTED},
1649};
1650
1651static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
1652        [NBD_SOCK_FD]                   =       { .type = NLA_U32 },
1653};
1654
1655/* We don't use this right now since we don't parse the incoming list, but we
1656 * still want it here so userspace knows what to expect.
1657 */
1658static const struct nla_policy __attribute__((unused))
1659nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1660        [NBD_DEVICE_INDEX]              =       { .type = NLA_U32 },
1661        [NBD_DEVICE_CONNECTED]          =       { .type = NLA_U8 },
1662};
1663
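/*
 * NBD_CMD_CONNECT handler.  A rough sketch of the attribute layout the
 * parsing below expects (NBD_ATTR_SOCKETS and NBD_ATTR_SIZE_BYTES are
 * mandatory, everything else is optional):
 *
 *	NBD_ATTR_INDEX			(u32; absent = pick a free device)
 *	NBD_ATTR_SIZE_BYTES		(u64)
 *	NBD_ATTR_BLOCK_SIZE_BYTES	(u64)
 *	NBD_ATTR_TIMEOUT		(u64, seconds)
 *	NBD_ATTR_DEAD_CONN_TIMEOUT	(u64, seconds)
 *	NBD_ATTR_SERVER_FLAGS / NBD_ATTR_CLIENT_FLAGS (u64)
 *	NBD_ATTR_SOCKETS		(nested)
 *		NBD_SOCK_ITEM		(nested)
 *			NBD_SOCK_FD	(u32, connected socket fd)
 *		NBD_SOCK_ITEM ...
 */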
1664static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1665{
1666        struct nbd_device *nbd = NULL;
1667        struct nbd_config *config;
1668        int index = -1;
1669        int ret;
1670        bool put_dev = false;
1671
1672        if (!netlink_capable(skb, CAP_SYS_ADMIN))
1673                return -EPERM;
1674
1675        if (info->attrs[NBD_ATTR_INDEX])
1676                index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1677        if (!info->attrs[NBD_ATTR_SOCKETS]) {
1678                printk(KERN_ERR "nbd: must specify at least one socket\n");
1679                return -EINVAL;
1680        }
1681        if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
1682                printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
1683                return -EINVAL;
1684        }
1685again:
1686        mutex_lock(&nbd_index_mutex);
1687        if (index == -1) {
1688                ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
1689                if (ret == 0) {
1690                        int new_index;
1691                        new_index = nbd_dev_add(-1);
1692                        if (new_index < 0) {
1693                                mutex_unlock(&nbd_index_mutex);
1694                                printk(KERN_ERR "nbd: failed to add new device\n");
1695                                return new_index;
1696                        }
1697                        nbd = idr_find(&nbd_index_idr, new_index);
1698                }
1699        } else {
1700                nbd = idr_find(&nbd_index_idr, index);
1701                if (!nbd) {
1702                        ret = nbd_dev_add(index);
1703                        if (ret < 0) {
1704                                mutex_unlock(&nbd_index_mutex);
1705                                printk(KERN_ERR "nbd: failed to add new device\n");
1706                                return ret;
1707                        }
1708                        nbd = idr_find(&nbd_index_idr, index);
1709                }
1710        }
1711        if (!nbd) {
1712                printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1713                       index);
1714                mutex_unlock(&nbd_index_mutex);
1715                return -EINVAL;
1716        }
1717        if (!refcount_inc_not_zero(&nbd->refs)) {
1718                mutex_unlock(&nbd_index_mutex);
1719                if (index == -1)
1720                        goto again;
1721                printk(KERN_ERR "nbd: device at index %d is going down\n",
1722                       index);
1723                return -EINVAL;
1724        }
1725        mutex_unlock(&nbd_index_mutex);
1726
1727        mutex_lock(&nbd->config_lock);
1728        if (refcount_read(&nbd->config_refs)) {
1729                mutex_unlock(&nbd->config_lock);
1730                nbd_put(nbd);
1731                if (index == -1)
1732                        goto again;
1733                printk(KERN_ERR "nbd: nbd%d already in use\n", index);
1734                return -EBUSY;
1735        }
1736        if (WARN_ON(nbd->config)) {
1737                mutex_unlock(&nbd->config_lock);
1738                nbd_put(nbd);
1739                return -EINVAL;
1740        }
1741        config = nbd->config = nbd_alloc_config();
1742        if (!nbd->config) {
1743                mutex_unlock(&nbd->config_lock);
1744                nbd_put(nbd);
1745                printk(KERN_ERR "nbd: couldn't allocate config\n");
1746                return -ENOMEM;
1747        }
1748        refcount_set(&nbd->config_refs, 1);
1749        set_bit(NBD_BOUND, &config->runtime_flags);
1750
1751        if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
1752                u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1753                nbd_size_set(nbd, config->blksize,
1754                             div64_u64(bytes, config->blksize));
1755        }
1756        if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
1757                u64 bsize =
1758                        nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1759                nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
1760        }
1761        if (info->attrs[NBD_ATTR_TIMEOUT]) {
1762                u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1763                nbd->tag_set.timeout = timeout * HZ;
1764                blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1765        }
1766        if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1767                config->dead_conn_timeout =
1768                        nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1769                config->dead_conn_timeout *= HZ;
1770        }
1771        if (info->attrs[NBD_ATTR_SERVER_FLAGS])
1772                config->flags =
1773                        nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
1774        if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1775                u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1776                if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1777                        set_bit(NBD_DESTROY_ON_DISCONNECT,
1778                                &config->runtime_flags);
1779                        put_dev = true;
1780                }
1781                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1782                        set_bit(NBD_DISCONNECT_ON_CLOSE,
1783                                &config->runtime_flags);
1784                }
1785        }
1786
1787        if (info->attrs[NBD_ATTR_SOCKETS]) {
1788                struct nlattr *attr;
1789                int rem, fd;
1790
1791                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1792                                    rem) {
1793                        struct nlattr *socks[NBD_SOCK_MAX+1];
1794
1795                        if (nla_type(attr) != NBD_SOCK_ITEM) {
1796                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1797                                ret = -EINVAL;
1798                                goto out;
1799                        }
1800                        ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
1801                                               nbd_sock_policy, info->extack);
1802                        if (ret != 0) {
1803                                printk(KERN_ERR "nbd: error processing sock list\n");
1804                                ret = -EINVAL;
1805                                goto out;
1806                        }
1807                        if (!socks[NBD_SOCK_FD])
1808                                continue;
1809                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1810                        ret = nbd_add_socket(nbd, fd, true);
1811                        if (ret)
1812                                goto out;
1813                }
1814        }
1815        ret = nbd_start_device(nbd);
1816out:
1817        mutex_unlock(&nbd->config_lock);
1818        if (!ret) {
1819                set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
1820                refcount_inc(&nbd->config_refs);
1821                nbd_connect_reply(info, nbd->index);
1822        }
1823        nbd_config_put(nbd);
1824        if (put_dev)
1825                nbd_put(nbd);
1826        return ret;
1827}
1828
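/*
 * Shared by NBD_CMD_DISCONNECT and the DISCONNECT_ON_CLOSE path in
 * nbd_release(): send the disconnect request, tear down the sockets, and
 * drop the long-lived config reference taken when the device was connected
 * over netlink (NBD_HAS_CONFIG_REF).
 */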
1829static void nbd_disconnect_and_put(struct nbd_device *nbd)
1830{
1831        mutex_lock(&nbd->config_lock);
1832        nbd_disconnect(nbd);
1833        nbd_clear_sock(nbd);
1834        mutex_unlock(&nbd->config_lock);
1835        if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1836                               &nbd->config->runtime_flags))
1837                nbd_config_put(nbd);
1838}
1839
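/*
 * NBD_CMD_DISCONNECT handler: look up the device by NBD_ATTR_INDEX and, if
 * it still has a configuration, disconnect it.  Returns 0 if the device is
 * already unconfigured.
 */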
1840static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
1841{
1842        struct nbd_device *nbd;
1843        int index;
1844
1845        if (!netlink_capable(skb, CAP_SYS_ADMIN))
1846                return -EPERM;
1847
1848        if (!info->attrs[NBD_ATTR_INDEX]) {
1849                printk(KERN_ERR "nbd: must specify an index to disconnect\n");
1850                return -EINVAL;
1851        }
1852        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1853        mutex_lock(&nbd_index_mutex);
1854        nbd = idr_find(&nbd_index_idr, index);
1855        if (!nbd) {
1856                mutex_unlock(&nbd_index_mutex);
1857                printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1858                       index);
1859                return -EINVAL;
1860        }
1861        if (!refcount_inc_not_zero(&nbd->refs)) {
1862                mutex_unlock(&nbd_index_mutex);
1863                printk(KERN_ERR "nbd: device at index %d is going down\n",
1864                       index);
1865                return -EINVAL;
1866        }
1867        mutex_unlock(&nbd_index_mutex);
1868        if (!refcount_inc_not_zero(&nbd->config_refs)) {
1869                nbd_put(nbd);
1870                return 0;
1871        }
1872        nbd_disconnect_and_put(nbd);
1873        nbd_config_put(nbd);
1874        nbd_put(nbd);
1875        return 0;
1876}
1877
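/*
 * NBD_CMD_RECONFIGURE handler: for a device that is already up and bound
 * via netlink (NBD_BOUND set and a receive thread running), adjust the
 * timeouts and client flags and/or hand in replacement sockets for dead
 * connections via nbd_reconnect_socket().
 */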
1878static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
1879{
1880        struct nbd_device *nbd = NULL;
1881        struct nbd_config *config;
1882        int index;
1883        int ret = 0;
1884        bool put_dev = false;
1885
1886        if (!netlink_capable(skb, CAP_SYS_ADMIN))
1887                return -EPERM;
1888
1889        if (!info->attrs[NBD_ATTR_INDEX]) {
1890                printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
1891                return -EINVAL;
1892        }
1893        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1894        mutex_lock(&nbd_index_mutex);
1895        nbd = idr_find(&nbd_index_idr, index);
1896        if (!nbd) {
1897                mutex_unlock(&nbd_index_mutex);
1898                printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
1899                       index);
1900                return -EINVAL;
1901        }
1902        if (!refcount_inc_not_zero(&nbd->refs)) {
1903                mutex_unlock(&nbd_index_mutex);
1904                printk(KERN_ERR "nbd: device at index %d is going down\n",
1905                       index);
1906                return -EINVAL;
1907        }
1908        mutex_unlock(&nbd_index_mutex);
1909
1910        if (!refcount_inc_not_zero(&nbd->config_refs)) {
1911                dev_err(nbd_to_dev(nbd),
1912                        "not configured, cannot reconfigure\n");
1913                nbd_put(nbd);
1914                return -EINVAL;
1915        }
1916
1917        mutex_lock(&nbd->config_lock);
1918        config = nbd->config;
1919        if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
1920            !nbd->task_recv) {
1921                dev_err(nbd_to_dev(nbd),
1922                        "not configured, cannot reconfigure\n");
1923                ret = -EINVAL;
1924                goto out;
1925        }
1926
1927        if (info->attrs[NBD_ATTR_TIMEOUT]) {
1928                u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1929                nbd->tag_set.timeout = timeout * HZ;
1930                blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1931        }
1932        if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1933                config->dead_conn_timeout =
1934                        nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1935                config->dead_conn_timeout *= HZ;
1936        }
1937        if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1938                u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1939                if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1940                        if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
1941                                              &config->runtime_flags))
1942                                put_dev = true;
1943                } else {
1944                        if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
1945                                               &config->runtime_flags))
1946                                refcount_inc(&nbd->refs);
1947                }
1948
1949                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1950                        set_bit(NBD_DISCONNECT_ON_CLOSE,
1951                                        &config->runtime_flags);
1952                } else {
1953                        clear_bit(NBD_DISCONNECT_ON_CLOSE,
1954                                        &config->runtime_flags);
1955                }
1956        }
1957
1958        if (info->attrs[NBD_ATTR_SOCKETS]) {
1959                struct nlattr *attr;
1960                int rem, fd;
1961
1962                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1963                                    rem) {
1964                        struct nlattr *socks[NBD_SOCK_MAX+1];
1965
1966                        if (nla_type(attr) != NBD_SOCK_ITEM) {
1967                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1968                                ret = -EINVAL;
1969                                goto out;
1970                        }
1971                        ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
1972                                               nbd_sock_policy, info->extack);
1973                        if (ret != 0) {
1974                                printk(KERN_ERR "nbd: error processing sock list\n");
1975                                ret = -EINVAL;
1976                                goto out;
1977                        }
1978                        if (!socks[NBD_SOCK_FD])
1979                                continue;
1980                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1981                        ret = nbd_reconnect_socket(nbd, fd);
1982                        if (ret) {
1983                                if (ret == -ENOSPC)
1984                                        ret = 0;
1985                                goto out;
1986                        }
1987                        dev_info(nbd_to_dev(nbd), "reconnected socket\n");
1988                }
1989        }
1990out:
1991        mutex_unlock(&nbd->config_lock);
1992        nbd_config_put(nbd);
1993        nbd_put(nbd);
1994        if (put_dev)
1995                nbd_put(nbd);
1996        return ret;
1997}
1998
1999static const struct genl_ops nbd_connect_genl_ops[] = {
2000        {
2001                .cmd    = NBD_CMD_CONNECT,
2002                .policy = nbd_attr_policy,
2003                .doit   = nbd_genl_connect,
2004        },
2005        {
2006                .cmd    = NBD_CMD_DISCONNECT,
2007                .policy = nbd_attr_policy,
2008                .doit   = nbd_genl_disconnect,
2009        },
2010        {
2011                .cmd    = NBD_CMD_RECONFIGURE,
2012                .policy = nbd_attr_policy,
2013                .doit   = nbd_genl_reconfigure,
2014        },
2015        {
2016                .cmd    = NBD_CMD_STATUS,
2017                .policy = nbd_attr_policy,
2018                .doit   = nbd_genl_status,
2019        },
2020};
2021
2022static const struct genl_multicast_group nbd_mcast_grps[] = {
2023        { .name = NBD_GENL_MCAST_GROUP_NAME, },
2024};
2025
2026static struct genl_family nbd_genl_family __ro_after_init = {
2027        .hdrsize        = 0,
2028        .name           = NBD_GENL_FAMILY_NAME,
2029        .version        = NBD_GENL_VERSION,
2030        .module         = THIS_MODULE,
2031        .ops            = nbd_connect_genl_ops,
2032        .n_ops          = ARRAY_SIZE(nbd_connect_genl_ops),
2033        .maxattr        = NBD_ATTR_MAX,
2034        .mcgrps         = nbd_mcast_grps,
2035        .n_mcgrps       = ARRAY_SIZE(nbd_mcast_grps),
2036};
2037
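/*
 * Emit one NBD_DEVICE_ITEM nest into a status reply:
 *
 *	NBD_DEVICE_ITEM (nested)
 *		NBD_DEVICE_INDEX	(u32)
 *		NBD_DEVICE_CONNECTED	(u8, 1 if a config is attached)
 *
 * Used for both the single-device and the whole-list NBD_CMD_STATUS reply,
 * where the items are collected under NBD_ATTR_DEVICE_LIST.
 */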
2038static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
2039{
2040        struct nlattr *dev_opt;
2041        u8 connected = 0;
2042        int ret;
2043
2044        /* This is a little racy, but for status it's ok.  The
2045         * reason we don't take a ref here is that we can't
2046         * take a ref in the index == -1 case, as we would have
2047         * to drop it while still holding nbd_index_mutex, which
2048         * could deadlock if we are configured to remove ourselves
2049         * once we're disconnected.
2050         */
2051        if (refcount_read(&nbd->config_refs))
2052                connected = 1;
2053        dev_opt = nla_nest_start(reply, NBD_DEVICE_ITEM);
2054        if (!dev_opt)
2055                return -EMSGSIZE;
2056        ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
2057        if (ret)
2058                return -EMSGSIZE;
2059        ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
2060                         connected);
2061        if (ret)
2062                return -EMSGSIZE;
2063        nla_nest_end(reply, dev_opt);
2064        return 0;
2065}
2066
2067static int status_cb(int id, void *ptr, void *data)
2068{
2069        struct nbd_device *nbd = ptr;
2070        return populate_nbd_status(nbd, (struct sk_buff *)data);
2071}
2072
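/*
 * NBD_CMD_STATUS handler: reply with an NBD_ATTR_DEVICE_LIST nest holding
 * one NBD_DEVICE_ITEM per device, or just the device named by
 * NBD_ATTR_INDEX when one was given.
 */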
2073static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
2074{
2075        struct nlattr *dev_list;
2076        struct sk_buff *reply;
2077        void *reply_head;
2078        size_t msg_size;
2079        int index = -1;
2080        int ret = -ENOMEM;
2081
2082        if (info->attrs[NBD_ATTR_INDEX])
2083                index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2084
2085        mutex_lock(&nbd_index_mutex);
2086
2087        msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
2088                                  nla_attr_size(sizeof(u8)));
2089        msg_size *= (index == -1) ? nbd_total_devices : 1;
2090
2091        reply = genlmsg_new(msg_size, GFP_KERNEL);
2092        if (!reply)
2093                goto out;
2094        reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2095                                       NBD_CMD_STATUS);
2096        if (!reply_head) {
2097                nlmsg_free(reply);
2098                goto out;
2099        }
2100
2101        dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST);
2102        if (index == -1) {
2103                ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2104                if (ret) {
2105                        nlmsg_free(reply);
2106                        goto out;
2107                }
2108        } else {
2109                struct nbd_device *nbd;
2110                nbd = idr_find(&nbd_index_idr, index);
2111                if (nbd) {
2112                        ret = populate_nbd_status(nbd, reply);
2113                        if (ret) {
2114                                nlmsg_free(reply);
2115                                goto out;
2116                        }
2117                }
2118        }
2119        nla_nest_end(reply, dev_list);
2120        genlmsg_end(reply, reply_head);
2121        genlmsg_reply(reply, info);
2122        ret = 0;
2123out:
2124        mutex_unlock(&nbd_index_mutex);
2125        return ret;
2126}
2127
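/*
 * Unicast reply to a successful NBD_CMD_CONNECT carrying NBD_ATTR_INDEX,
 * so that userspace learns which nbd<idx> it was given when it asked for
 * any free device.
 */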
2128static void nbd_connect_reply(struct genl_info *info, int index)
2129{
2130        struct sk_buff *skb;
2131        void *msg_head;
2132        int ret;
2133
2134        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2135        if (!skb)
2136                return;
2137        msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2138                                     NBD_CMD_CONNECT);
2139        if (!msg_head) {
2140                nlmsg_free(skb);
2141                return;
2142        }
2143        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2144        if (ret) {
2145                nlmsg_free(skb);
2146                return;
2147        }
2148        genlmsg_end(skb, msg_head);
2149        genlmsg_reply(skb, info);
2150}
2151
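/*
 * Broadcast NBD_CMD_LINK_DEAD with the device index on the nbd multicast
 * group.  This runs from nbd_dead_link_work() below so the notification
 * can be allocated and sent from process context.
 */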
2152static void nbd_mcast_index(int index)
2153{
2154        struct sk_buff *skb;
2155        void *msg_head;
2156        int ret;
2157
2158        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2159        if (!skb)
2160                return;
2161        msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2162                                     NBD_CMD_LINK_DEAD);
2163        if (!msg_head) {
2164                nlmsg_free(skb);
2165                return;
2166        }
2167        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2168        if (ret) {
2169                nlmsg_free(skb);
2170                return;
2171        }
2172        genlmsg_end(skb, msg_head);
2173        genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2174}
2175
2176static void nbd_dead_link_work(struct work_struct *work)
2177{
2178        struct link_dead_args *args = container_of(work, struct link_dead_args,
2179                                                   work);
2180        nbd_mcast_index(args->index);
2181        kfree(args);
2182}
2183
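/*
 * Module init: validate max_part/nbds_max, create the receive workqueue,
 * register the NBD_MAJOR block device and the netlink family, set up
 * debugfs, and pre-create nbds_max devices (nbd0..nbd<nbds_max-1>).
 */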
2184static int __init nbd_init(void)
2185{
2186        int i;
2187
2188        BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
2189
2190        if (max_part < 0) {
2191                printk(KERN_ERR "nbd: max_part must be >= 0\n");
2192                return -EINVAL;
2193        }
2194
2195        part_shift = 0;
2196        if (max_part > 0) {
2197                part_shift = fls(max_part);
2198
2199                /*
2200                 * Adjust max_part according to part_shift as it is exported
2201                 * to user space, so that userspace knows the maximum number
2202                 * of partitions the kernel can manage per device.
2203                 *
2204                 * Note that -1 is required because partition 0 is reserved
2205                 * for the whole disk.
2206                 */
2207                max_part = (1UL << part_shift) - 1;
2208        }
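        /*
         * Example, assuming the default max_part of 16: part_shift =
         * fls(16) = 5, so each device spans 1 << 5 = 32 minors and
         * max_part is reported back as 31 usable partitions.
         */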
2209
2210        if ((1UL << part_shift) > DISK_MAX_PARTS)
2211                return -EINVAL;
2212
2213        if (nbds_max > 1UL << (MINORBITS - part_shift))
2214                return -EINVAL;
2215        recv_workqueue = alloc_workqueue("knbd-recv",
2216                                         WQ_MEM_RECLAIM | WQ_HIGHPRI |
2217                                         WQ_UNBOUND, 0);
2218        if (!recv_workqueue)
2219                return -ENOMEM;
2220
2221        if (register_blkdev(NBD_MAJOR, "nbd")) {
2222                destroy_workqueue(recv_workqueue);
2223                return -EIO;
2224        }
2225
2226        if (genl_register_family(&nbd_genl_family)) {
2227                unregister_blkdev(NBD_MAJOR, "nbd");
2228                destroy_workqueue(recv_workqueue);
2229                return -EINVAL;
2230        }
2231        nbd_dbg_init();
2232
2233        mutex_lock(&nbd_index_mutex);
2234        for (i = 0; i < nbds_max; i++)
2235                nbd_dev_add(i);
2236        mutex_unlock(&nbd_index_mutex);
2237        return 0;
2238}
2239
2240static int nbd_exit_cb(int id, void *ptr, void *data)
2241{
2242        struct list_head *list = (struct list_head *)data;
2243        struct nbd_device *nbd = ptr;
2244
2245        list_add_tail(&nbd->list, list);
2246        return 0;
2247}
2248
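/*
 * Module exit: devices are first collected onto a private list under
 * nbd_index_mutex and only released after the lock is dropped, since
 * dropping the final reference in nbd_put() takes nbd_index_mutex again
 * to remove the device from the idr.
 */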
2249static void __exit nbd_cleanup(void)
2250{
2251        struct nbd_device *nbd;
2252        LIST_HEAD(del_list);
2253
2254        nbd_dbg_close();
2255
2256        mutex_lock(&nbd_index_mutex);
2257        idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2258        mutex_unlock(&nbd_index_mutex);
2259
2260        while (!list_empty(&del_list)) {
2261                nbd = list_first_entry(&del_list, struct nbd_device, list);
2262                list_del_init(&nbd->list);
2263                if (refcount_read(&nbd->refs) != 1)
2264                        printk(KERN_ERR "nbd: possibly leaking a device\n");
2265                nbd_put(nbd);
2266        }
2267
2268        idr_destroy(&nbd_index_idr);
2269        genl_unregister_family(&nbd_genl_family);
2270        destroy_workqueue(recv_workqueue);
2271        unregister_blkdev(NBD_MAJOR, "nbd");
2272}
2273
2274module_init(nbd_init);
2275module_exit(nbd_cleanup);
2276
2277MODULE_DESCRIPTION("Network Block Device");
2278MODULE_LICENSE("GPL");
2279
2280module_param(nbds_max, int, 0444);
2281MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2282module_param(max_part, int, 0444);
2283MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
2284