linux/drivers/block/nbd.c
   1/*
   2 * Network block device - make block devices work over TCP
   3 *
   4 * Note that you cannot swap over this thing, yet. It seems to work but
   5 * deadlocks sometimes - you cannot swap over TCP in general.
   6 * 
   7 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
   8 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
   9 *
  10 * This file is released under GPLv2 or later.
  11 *
  12 * (part of code stolen from loop.c)
  13 */
  14
  15#include <linux/major.h>
  16
  17#include <linux/blkdev.h>
  18#include <linux/module.h>
  19#include <linux/init.h>
  20#include <linux/sched.h>
  21#include <linux/sched/mm.h>
  22#include <linux/fs.h>
  23#include <linux/bio.h>
  24#include <linux/stat.h>
  25#include <linux/errno.h>
  26#include <linux/file.h>
  27#include <linux/ioctl.h>
  28#include <linux/mutex.h>
  29#include <linux/compiler.h>
  30#include <linux/err.h>
  31#include <linux/kernel.h>
  32#include <linux/slab.h>
  33#include <net/sock.h>
  34#include <linux/net.h>
  35#include <linux/kthread.h>
  36#include <linux/types.h>
  37#include <linux/debugfs.h>
  38#include <linux/blk-mq.h>
  39
  40#include <linux/uaccess.h>
  41#include <asm/types.h>
  42
  43#include <linux/nbd.h>
  44#include <linux/nbd-netlink.h>
  45#include <net/genetlink.h>
  46
  47static DEFINE_IDR(nbd_index_idr);
  48static DEFINE_MUTEX(nbd_index_mutex);
  49static int nbd_total_devices = 0;
  50
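/*
 * Per-connection state.  tx_lock serialises senders on this socket,
 * pending/sent track a partially transmitted request so it can be
 * resumed, and cookie is bumped each time the socket is replaced so
 * the timeout path can tell whether a command went out on the current
 * socket or on an older, now dead one.
 */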
  51struct nbd_sock {
  52        struct socket *sock;
  53        struct mutex tx_lock;
  54        struct request *pending;
  55        int sent;
  56        bool dead;
  57        int fallback_index;
  58        int cookie;
  59};
  60
  61struct recv_thread_args {
  62        struct work_struct work;
  63        struct nbd_device *nbd;
  64        int index;
  65};
  66
  67struct link_dead_args {
  68        struct work_struct work;
  69        int index;
  70};
  71
  72#define NBD_TIMEDOUT                    0
  73#define NBD_DISCONNECT_REQUESTED        1
  74#define NBD_DISCONNECTED                2
  75#define NBD_HAS_PID_FILE                3
  76#define NBD_HAS_CONFIG_REF              4
  77#define NBD_BOUND                       5
  78#define NBD_DESTROY_ON_DISCONNECT       6
  79#define NBD_DISCONNECT_ON_CLOSE         7
  80
  81struct nbd_config {
  82        u32 flags;
  83        unsigned long runtime_flags;
  84        u64 dead_conn_timeout;
  85
  86        struct nbd_sock **socks;
  87        int num_connections;
  88        atomic_t live_connections;
  89        wait_queue_head_t conn_wait;
  90
  91        atomic_t recv_threads;
  92        wait_queue_head_t recv_wq;
  93        loff_t blksize;
  94        loff_t bytesize;
  95#if IS_ENABLED(CONFIG_DEBUG_FS)
  96        struct dentry *dbg_dir;
  97#endif
  98};
  99
 100struct nbd_device {
 101        struct blk_mq_tag_set tag_set;
 102
 103        int index;
 104        refcount_t config_refs;
 105        refcount_t refs;
 106        struct nbd_config *config;
 107        struct mutex config_lock;
 108        struct gendisk *disk;
 109
 110        struct list_head list;
 111        struct task_struct *task_recv;
 112        struct task_struct *task_setup;
 113};
 114
 115#define NBD_CMD_REQUEUED        1
 116
 117struct nbd_cmd {
 118        struct nbd_device *nbd;
 119        struct mutex lock;
 120        int index;
 121        int cookie;
 122        blk_status_t status;
 123        unsigned long flags;
 124        u32 cmd_cookie;
 125};
 126
 127#if IS_ENABLED(CONFIG_DEBUG_FS)
 128static struct dentry *nbd_dbg_dir;
 129#endif
 130
 131#define nbd_name(nbd) ((nbd)->disk->disk_name)
 132
 133#define NBD_MAGIC 0x68797548
 134
 135static unsigned int nbds_max = 16;
 136static int max_part = 16;
 137static struct workqueue_struct *recv_workqueue;
 138static int part_shift;
 139
 140static int nbd_dev_dbg_init(struct nbd_device *nbd);
 141static void nbd_dev_dbg_close(struct nbd_device *nbd);
 142static void nbd_config_put(struct nbd_device *nbd);
 143static void nbd_connect_reply(struct genl_info *info, int index);
 144static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
 145static void nbd_dead_link_work(struct work_struct *work);
 146static void nbd_disconnect_and_put(struct nbd_device *nbd);
 147
 148static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 149{
 150        return disk_to_dev(nbd->disk);
 151}
 152
 153static void nbd_requeue_cmd(struct nbd_cmd *cmd)
 154{
 155        struct request *req = blk_mq_rq_from_pdu(cmd);
 156
 157        if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
 158                blk_mq_requeue_request(req, true);
 159}
 160
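/*
 * Each request is identified on the wire by a 64-bit handle: the low
 * 32 bits are the blk-mq unique tag (hardware queue number plus tag),
 * the high 32 bits are cmd->cmd_cookie, which is incremented every
 * time the command is freshly sent.  A reply whose cookie does not
 * match the current cmd_cookie is rejected as stale (e.g. a reply to
 * a request that has since timed out and been requeued).
 * Example: tag 5 on hwq 0 with cmd_cookie 3 packs to 0x0000000300000005.
 */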
 161#define NBD_COOKIE_BITS 32
 162
 163static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
 164{
 165        struct request *req = blk_mq_rq_from_pdu(cmd);
 166        u32 tag = blk_mq_unique_tag(req);
 167        u64 cookie = cmd->cmd_cookie;
 168
 169        return (cookie << NBD_COOKIE_BITS) | tag;
 170}
 171
 172static u32 nbd_handle_to_tag(u64 handle)
 173{
 174        return (u32)handle;
 175}
 176
 177static u32 nbd_handle_to_cookie(u64 handle)
 178{
 179        return (u32)(handle >> NBD_COOKIE_BITS);
 180}
 181
 182static const char *nbdcmd_to_ascii(int cmd)
 183{
 184        switch (cmd) {
 185        case  NBD_CMD_READ: return "read";
 186        case NBD_CMD_WRITE: return "write";
 187        case  NBD_CMD_DISC: return "disconnect";
 188        case NBD_CMD_FLUSH: return "flush";
 189        case  NBD_CMD_TRIM: return "trim/discard";
 190        }
 191        return "invalid";
 192}
 193
 194static ssize_t pid_show(struct device *dev,
 195                        struct device_attribute *attr, char *buf)
 196{
 197        struct gendisk *disk = dev_to_disk(dev);
 198        struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
 199
 200        return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
 201}
 202
 203static const struct device_attribute pid_attr = {
 204        .attr = { .name = "pid", .mode = 0444},
 205        .show = pid_show,
 206};
 207
 208static void nbd_dev_remove(struct nbd_device *nbd)
 209{
 210        struct gendisk *disk = nbd->disk;
 211        struct request_queue *q;
 212
 213        if (disk) {
 214                q = disk->queue;
 215                del_gendisk(disk);
 216                blk_cleanup_queue(q);
 217                blk_mq_free_tag_set(&nbd->tag_set);
 218                disk->private_data = NULL;
 219                put_disk(disk);
 220        }
 221        kfree(nbd);
 222}
 223
 224static void nbd_put(struct nbd_device *nbd)
 225{
 226        if (refcount_dec_and_mutex_lock(&nbd->refs,
 227                                        &nbd_index_mutex)) {
 228                idr_remove(&nbd_index_idr, nbd->index);
 229                mutex_unlock(&nbd_index_mutex);
 230                nbd_dev_remove(nbd);
 231        }
 232}
 233
 234static int nbd_disconnected(struct nbd_config *config)
 235{
 236        return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
 237                test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
 238}
 239
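/*
 * Mark a connection dead: shut the socket down so any reader errors
 * out, drop the live connection count, and clear any partially sent
 * request state.  If @notify is set and we are not already
 * disconnecting, a work item is queued to report the dead link.
 */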
 240static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
 241                                int notify)
 242{
 243        if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
 244                struct link_dead_args *args;
 245                args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
 246                if (args) {
 247                        INIT_WORK(&args->work, nbd_dead_link_work);
 248                        args->index = nbd->index;
 249                        queue_work(system_wq, &args->work);
 250                }
 251        }
 252        if (!nsock->dead) {
 253                kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
 254                if (atomic_dec_return(&nbd->config->live_connections) == 0) {
 255                        if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
 256                                               &nbd->config->runtime_flags)) {
 257                                set_bit(NBD_DISCONNECTED,
 258                                        &nbd->config->runtime_flags);
 259                                dev_info(nbd_to_dev(nbd),
 260                                        "Disconnected due to user request.\n");
 261                        }
 262                }
 263        }
 264        nsock->dead = true;
 265        nsock->pending = NULL;
 266        nsock->sent = 0;
 267}
 268
 269static void nbd_size_clear(struct nbd_device *nbd)
 270{
 271        if (nbd->config->bytesize) {
 272                set_capacity(nbd->disk, 0);
 273                kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 274        }
 275}
 276
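/*
 * Push the configured block size and byte size into the request queue
 * limits and the block device.  set_capacity() takes 512-byte sectors,
 * hence the >> 9 on bytesize.
 */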
 277static void nbd_size_update(struct nbd_device *nbd)
 278{
 279        struct nbd_config *config = nbd->config;
 280        struct block_device *bdev = bdget_disk(nbd->disk, 0);
 281
 282        if (config->flags & NBD_FLAG_SEND_TRIM) {
 283                nbd->disk->queue->limits.discard_granularity = config->blksize;
 284                nbd->disk->queue->limits.discard_alignment = config->blksize;
 285                blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
 286        }
 287        blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
 288        blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
 289        set_capacity(nbd->disk, config->bytesize >> 9);
 290        if (bdev) {
 291                if (bdev->bd_disk) {
 292                        bd_set_size(bdev, config->bytesize);
 293                        set_blocksize(bdev, config->blksize);
 294                } else
 295                        bdev->bd_invalidated = 1;
 296                bdput(bdev);
 297        }
 298        kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 299}
 300
 301static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
 302                         loff_t nr_blocks)
 303{
 304        struct nbd_config *config = nbd->config;
 305        config->blksize = blocksize;
 306        config->bytesize = blocksize * nr_blocks;
 307        if (nbd->task_recv != NULL)
 308                nbd_size_update(nbd);
 309}
 310
 311static void nbd_complete_rq(struct request *req)
 312{
 313        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 314
 315        dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
 316                cmd->status ? "failed" : "done");
 317
 318        blk_mq_end_request(req, cmd->status);
 319}
 320
 321/*
  322 * Forcibly shut down the sockets, causing all receivers to error out
 323 */
 324static void sock_shutdown(struct nbd_device *nbd)
 325{
 326        struct nbd_config *config = nbd->config;
 327        int i;
 328
 329        if (config->num_connections == 0)
 330                return;
 331        if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
 332                return;
 333
 334        for (i = 0; i < config->num_connections; i++) {
 335                struct nbd_sock *nsock = config->socks[i];
 336                mutex_lock(&nsock->tx_lock);
 337                nbd_mark_nsock_dead(nbd, nsock, 0);
 338                mutex_unlock(&nsock->tx_lock);
 339        }
 340        dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
 341}
 342
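/*
 * blk-mq timeout handler.  With more than one connection the request
 * is requeued so the submit path can pick a live socket (marking the
 * original socket dead only if it has not been replaced since the
 * send); with a single connection the request fails with BLK_STS_IOERR
 * and all sockets are shut down.
 */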
 343static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 344                                                 bool reserved)
 345{
 346        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 347        struct nbd_device *nbd = cmd->nbd;
 348        struct nbd_config *config;
 349
 350        if (!refcount_inc_not_zero(&nbd->config_refs)) {
 351                cmd->status = BLK_STS_TIMEOUT;
 352                goto done;
 353        }
 354        config = nbd->config;
 355
 356        if (!mutex_trylock(&cmd->lock))
 357                return BLK_EH_RESET_TIMER;
 358
 359        if (config->num_connections > 1) {
 360                dev_err_ratelimited(nbd_to_dev(nbd),
 361                                    "Connection timed out, retrying (%d/%d alive)\n",
 362                                    atomic_read(&config->live_connections),
 363                                    config->num_connections);
 364                /*
 365                 * Hooray we have more connections, requeue this IO, the submit
 366                 * path will put it on a real connection.
 367                 */
 368                if (config->socks && config->num_connections > 1) {
 369                        if (cmd->index < config->num_connections) {
 370                                struct nbd_sock *nsock =
 371                                        config->socks[cmd->index];
 372                                mutex_lock(&nsock->tx_lock);
  373                /* We can have multiple outstanding requests, and
  374                 * we don't want to mark the nsock dead if we've
  375                 * already reconnected with a new socket, so
  376                 * only mark it dead if it's the same socket the
  377                 * request was sent out on.
  378                 */
 379                                if (cmd->cookie == nsock->cookie)
 380                                        nbd_mark_nsock_dead(nbd, nsock, 1);
 381                                mutex_unlock(&nsock->tx_lock);
 382                        }
 383                        mutex_unlock(&cmd->lock);
 384                        nbd_requeue_cmd(cmd);
 385                        nbd_config_put(nbd);
 386                        return BLK_EH_DONE;
 387                }
 388        } else {
 389                dev_err_ratelimited(nbd_to_dev(nbd),
 390                                    "Connection timed out\n");
 391        }
 392        set_bit(NBD_TIMEDOUT, &config->runtime_flags);
 393        cmd->status = BLK_STS_IOERR;
 394        mutex_unlock(&cmd->lock);
 395        sock_shutdown(nbd);
 396        nbd_config_put(nbd);
 397done:
 398        blk_mq_complete_request(req);
 399        return BLK_EH_DONE;
 400}
 401
 402/*
  403 *  Send or receive a packet, looping until the whole iov has been transferred or an error occurs.
 404 */
 405static int sock_xmit(struct nbd_device *nbd, int index, int send,
 406                     struct iov_iter *iter, int msg_flags, int *sent)
 407{
 408        struct nbd_config *config = nbd->config;
 409        struct socket *sock = config->socks[index]->sock;
 410        int result;
 411        struct msghdr msg;
 412        unsigned int noreclaim_flag;
 413
 414        if (unlikely(!sock)) {
 415                dev_err_ratelimited(disk_to_dev(nbd->disk),
 416                        "Attempted %s on closed socket in sock_xmit\n",
 417                        (send ? "send" : "recv"));
 418                return -EINVAL;
 419        }
 420
 421        msg.msg_iter = *iter;
 422
 423        noreclaim_flag = memalloc_noreclaim_save();
 424        do {
 425                sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
 426                msg.msg_name = NULL;
 427                msg.msg_namelen = 0;
 428                msg.msg_control = NULL;
 429                msg.msg_controllen = 0;
 430                msg.msg_flags = msg_flags | MSG_NOSIGNAL;
 431
 432                if (send)
 433                        result = sock_sendmsg(sock, &msg);
 434                else
 435                        result = sock_recvmsg(sock, &msg, msg.msg_flags);
 436
 437                if (result <= 0) {
 438                        if (result == 0)
 439                                result = -EPIPE; /* short read */
 440                        break;
 441                }
 442                if (sent)
 443                        *sent += result;
 444        } while (msg_data_left(&msg));
 445
 446        memalloc_noreclaim_restore(noreclaim_flag);
 447
 448        return result;
 449}
 450
 451/*
 452 * Different settings for sk->sk_sndtimeo can result in different return values
 453 * if there is a signal pending when we enter sendmsg, because reasons?
 454 */
 455static inline int was_interrupted(int result)
 456{
 457        return result == -ERESTARTSYS || result == -EINTR;
 458}
 459
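/*
 * Build the NBD request header and transmit it, followed by the bio
 * pages for writes.  If sendmsg is interrupted part way through, the
 * progress is saved in nsock->pending/nsock->sent and BLK_STS_RESOURCE
 * is returned so the request is retried on the same socket and resumed
 * where it left off.
 */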
 460/* always call with the tx_lock held */
 461static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 462{
 463        struct request *req = blk_mq_rq_from_pdu(cmd);
 464        struct nbd_config *config = nbd->config;
 465        struct nbd_sock *nsock = config->socks[index];
 466        int result;
 467        struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
 468        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
 469        struct iov_iter from;
 470        unsigned long size = blk_rq_bytes(req);
 471        struct bio *bio;
 472        u64 handle;
 473        u32 type;
 474        u32 nbd_cmd_flags = 0;
 475        int sent = nsock->sent, skip = 0;
 476
 477        iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
 478
 479        switch (req_op(req)) {
 480        case REQ_OP_DISCARD:
 481                type = NBD_CMD_TRIM;
 482                break;
 483        case REQ_OP_FLUSH:
 484                type = NBD_CMD_FLUSH;
 485                break;
 486        case REQ_OP_WRITE:
 487                type = NBD_CMD_WRITE;
 488                break;
 489        case REQ_OP_READ:
 490                type = NBD_CMD_READ;
 491                break;
 492        default:
 493                return -EIO;
 494        }
 495
 496        if (rq_data_dir(req) == WRITE &&
 497            (config->flags & NBD_FLAG_READ_ONLY)) {
 498                dev_err_ratelimited(disk_to_dev(nbd->disk),
 499                                    "Write on read-only\n");
 500                return -EIO;
 501        }
 502
 503        if (req->cmd_flags & REQ_FUA)
 504                nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
 505
 506        /* We did a partial send previously, and we at least sent the whole
 507         * request struct, so just go and send the rest of the pages in the
 508         * request.
 509         */
 510        if (sent) {
 511                if (sent >= sizeof(request)) {
 512                        skip = sent - sizeof(request);
 513                        goto send_pages;
 514                }
 515                iov_iter_advance(&from, sent);
 516        } else {
 517                cmd->cmd_cookie++;
 518        }
 519        cmd->index = index;
 520        cmd->cookie = nsock->cookie;
 521        request.type = htonl(type | nbd_cmd_flags);
 522        if (type != NBD_CMD_FLUSH) {
 523                request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 524                request.len = htonl(size);
 525        }
 526        handle = nbd_cmd_handle(cmd);
 527        memcpy(request.handle, &handle, sizeof(handle));
 528
 529        dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
 530                req, nbdcmd_to_ascii(type),
 531                (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
 532        result = sock_xmit(nbd, index, 1, &from,
 533                        (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
 534        if (result <= 0) {
 535                if (was_interrupted(result)) {
  536                        /* If we haven't sent anything we can just return BUSY,
  537                         * however if we have sent something we need to make
  538                         * sure that only this req gets sent on this socket
  539                         * until it is completely done.
  540                         */
 541                        if (sent) {
 542                                nsock->pending = req;
 543                                nsock->sent = sent;
 544                        }
 545                        set_bit(NBD_CMD_REQUEUED, &cmd->flags);
 546                        return BLK_STS_RESOURCE;
 547                }
 548                dev_err_ratelimited(disk_to_dev(nbd->disk),
 549                        "Send control failed (result %d)\n", result);
 550                return -EAGAIN;
 551        }
 552send_pages:
 553        if (type != NBD_CMD_WRITE)
 554                goto out;
 555
 556        bio = req->bio;
 557        while (bio) {
 558                struct bio *next = bio->bi_next;
 559                struct bvec_iter iter;
 560                struct bio_vec bvec;
 561
 562                bio_for_each_segment(bvec, bio, iter) {
 563                        bool is_last = !next && bio_iter_last(bvec, iter);
 564                        int flags = is_last ? 0 : MSG_MORE;
 565
 566                        dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
 567                                req, bvec.bv_len);
 568                        iov_iter_bvec(&from, ITER_BVEC | WRITE,
 569                                      &bvec, 1, bvec.bv_len);
 570                        if (skip) {
 571                                if (skip >= iov_iter_count(&from)) {
 572                                        skip -= iov_iter_count(&from);
 573                                        continue;
 574                                }
 575                                iov_iter_advance(&from, skip);
 576                                skip = 0;
 577                        }
 578                        result = sock_xmit(nbd, index, 1, &from, flags, &sent);
 579                        if (result <= 0) {
 580                                if (was_interrupted(result)) {
 581                                        /* We've already sent the header, we
 582                                         * have no choice but to set pending and
 583                                         * return BUSY.
 584                                         */
 585                                        nsock->pending = req;
 586                                        nsock->sent = sent;
 587                                        set_bit(NBD_CMD_REQUEUED, &cmd->flags);
 588                                        return BLK_STS_RESOURCE;
 589                                }
 590                                dev_err(disk_to_dev(nbd->disk),
 591                                        "Send data failed (result %d)\n",
 592                                        result);
 593                                return -EAGAIN;
 594                        }
 595                        /*
 596                         * The completion might already have come in,
 597                         * so break for the last one instead of letting
 598                         * the iterator do it. This prevents use-after-free
 599                         * of the bio.
 600                         */
 601                        if (is_last)
 602                                break;
 603                }
 604                bio = next;
 605        }
 606out:
 607        nsock->pending = NULL;
 608        nsock->sent = 0;
 609        return 0;
 610}
 611
  612/* An ERR_PTR return value means something went wrong, inform userspace */
 613static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
 614{
 615        struct nbd_config *config = nbd->config;
 616        int result;
 617        struct nbd_reply reply;
 618        struct nbd_cmd *cmd;
 619        struct request *req = NULL;
 620        u64 handle;
 621        u16 hwq;
 622        u32 tag;
 623        struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
 624        struct iov_iter to;
 625        int ret = 0;
 626
 627        reply.magic = 0;
 628        iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
 629        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 630        if (result <= 0) {
 631                if (!nbd_disconnected(config))
 632                        dev_err(disk_to_dev(nbd->disk),
 633                                "Receive control failed (result %d)\n", result);
 634                return ERR_PTR(result);
 635        }
 636
 637        if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
 638                dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
 639                                (unsigned long)ntohl(reply.magic));
 640                return ERR_PTR(-EPROTO);
 641        }
 642
 643        memcpy(&handle, reply.handle, sizeof(handle));
 644        tag = nbd_handle_to_tag(handle);
 645        hwq = blk_mq_unique_tag_to_hwq(tag);
 646        if (hwq < nbd->tag_set.nr_hw_queues)
 647                req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
 648                                       blk_mq_unique_tag_to_tag(tag));
 649        if (!req || !blk_mq_request_started(req)) {
 650                dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
 651                        tag, req);
 652                return ERR_PTR(-ENOENT);
 653        }
 654        cmd = blk_mq_rq_to_pdu(req);
 655
 656        mutex_lock(&cmd->lock);
 657        if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
 658                dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
 659                        req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
 660                ret = -ENOENT;
 661                goto out;
 662        }
 663        if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
 664                dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
 665                        req);
 666                ret = -ENOENT;
 667                goto out;
 668        }
 669        if (ntohl(reply.error)) {
 670                dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
 671                        ntohl(reply.error));
 672                cmd->status = BLK_STS_IOERR;
 673                goto out;
 674        }
 675
 676        dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
 677        if (rq_data_dir(req) != WRITE) {
 678                struct req_iterator iter;
 679                struct bio_vec bvec;
 680
 681                rq_for_each_segment(bvec, req, iter) {
 682                        iov_iter_bvec(&to, ITER_BVEC | READ,
 683                                      &bvec, 1, bvec.bv_len);
 684                        result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
 685                        if (result <= 0) {
 686                                dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 687                                        result);
 688                                /*
 689                                 * If we've disconnected or we only have 1
 690                                 * connection then we need to make sure we
 691                                 * complete this request, otherwise error out
 692                                 * and let the timeout stuff handle resubmitting
 693                                 * this request onto another connection.
 694                                 */
 695                                if (nbd_disconnected(config) ||
 696                                    config->num_connections <= 1) {
 697                                        cmd->status = BLK_STS_IOERR;
 698                                        goto out;
 699                                }
 700                                ret = -EIO;
 701                                goto out;
 702                        }
 703                        dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
 704                                req, bvec.bv_len);
 705                }
 706        }
 707out:
 708        mutex_unlock(&cmd->lock);
 709        return ret ? ERR_PTR(ret) : cmd;
 710}
 711
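/*
 * Per-connection receive worker: read replies off the socket and
 * complete the matching requests until the socket fails, then mark
 * this connection dead and drop the config reference taken when the
 * worker was queued.
 */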
 712static void recv_work(struct work_struct *work)
 713{
 714        struct recv_thread_args *args = container_of(work,
 715                                                     struct recv_thread_args,
 716                                                     work);
 717        struct nbd_device *nbd = args->nbd;
 718        struct nbd_config *config = nbd->config;
 719        struct nbd_cmd *cmd;
 720
 721        while (1) {
 722                cmd = nbd_read_stat(nbd, args->index);
 723                if (IS_ERR(cmd)) {
 724                        struct nbd_sock *nsock = config->socks[args->index];
 725
 726                        mutex_lock(&nsock->tx_lock);
 727                        nbd_mark_nsock_dead(nbd, nsock, 1);
 728                        mutex_unlock(&nsock->tx_lock);
 729                        break;
 730                }
 731
 732                blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
 733        }
 734        atomic_dec(&config->recv_threads);
 735        wake_up(&config->recv_wq);
 736        nbd_config_put(nbd);
 737        kfree(args);
 738}
 739
 740static bool nbd_clear_req(struct request *req, void *data, bool reserved)
 741{
 742        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 743
 744        cmd->status = BLK_STS_IOERR;
 745        blk_mq_complete_request(req);
 746        return true;
 747}
 748
 749static void nbd_clear_que(struct nbd_device *nbd)
 750{
 751        blk_mq_quiesce_queue(nbd->disk->queue);
 752        blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 753        blk_mq_unquiesce_queue(nbd->disk->queue);
 754        dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
 755}
 756
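/*
 * Pick another live connection to use in place of a dead one.  The
 * result is cached in fallback_index; -1 is returned if we are
 * disconnecting or no live socket remains.
 */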
 757static int find_fallback(struct nbd_device *nbd, int index)
 758{
 759        struct nbd_config *config = nbd->config;
 760        int new_index = -1;
 761        struct nbd_sock *nsock = config->socks[index];
 762        int fallback = nsock->fallback_index;
 763
 764        if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
 765                return new_index;
 766
 767        if (config->num_connections <= 1) {
 768                dev_err_ratelimited(disk_to_dev(nbd->disk),
 769                                    "Attempted send on invalid socket\n");
 770                return new_index;
 771        }
 772
 773        if (fallback >= 0 && fallback < config->num_connections &&
 774            !config->socks[fallback]->dead)
 775                return fallback;
 776
 777        if (nsock->fallback_index < 0 ||
 778            nsock->fallback_index >= config->num_connections ||
 779            config->socks[nsock->fallback_index]->dead) {
 780                int i;
 781                for (i = 0; i < config->num_connections; i++) {
 782                        if (i == index)
 783                                continue;
 784                        if (!config->socks[i]->dead) {
 785                                new_index = i;
 786                                break;
 787                        }
 788                }
 789                nsock->fallback_index = new_index;
 790                if (new_index < 0) {
 791                        dev_err_ratelimited(disk_to_dev(nbd->disk),
 792                                            "Dead connection, failed to find a fallback\n");
 793                        return new_index;
 794                }
 795        }
 796        new_index = nsock->fallback_index;
 797        return new_index;
 798}
 799
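/*
 * Returns nonzero if a live connection appeared before dead_conn_timeout
 * expired, zero if we should give up (no timeout configured, already
 * disconnected, or the wait timed out).
 */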
 800static int wait_for_reconnect(struct nbd_device *nbd)
 801{
 802        struct nbd_config *config = nbd->config;
 803        if (!config->dead_conn_timeout)
 804                return 0;
 805        if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
 806                return 0;
 807        return wait_event_timeout(config->conn_wait,
 808                                  atomic_read(&config->live_connections) > 0,
 809                                  config->dead_conn_timeout) > 0;
 810}
 811
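/*
 * Send one request on the connection that backs @index.  Dead sockets
 * are handled by falling back to another live connection or waiting
 * for a reconnect; -EAGAIN from the send path marks the socket dead
 * and requeues the request rather than failing it.
 */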
 812static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 813{
 814        struct request *req = blk_mq_rq_from_pdu(cmd);
 815        struct nbd_device *nbd = cmd->nbd;
 816        struct nbd_config *config;
 817        struct nbd_sock *nsock;
 818        int ret;
 819
 820        if (!refcount_inc_not_zero(&nbd->config_refs)) {
 821                dev_err_ratelimited(disk_to_dev(nbd->disk),
 822                                    "Socks array is empty\n");
 823                blk_mq_start_request(req);
 824                return -EINVAL;
 825        }
 826        config = nbd->config;
 827
 828        if (index >= config->num_connections) {
 829                dev_err_ratelimited(disk_to_dev(nbd->disk),
 830                                    "Attempted send on invalid socket\n");
 831                nbd_config_put(nbd);
 832                blk_mq_start_request(req);
 833                return -EINVAL;
 834        }
 835        cmd->status = BLK_STS_OK;
 836again:
 837        nsock = config->socks[index];
 838        mutex_lock(&nsock->tx_lock);
 839        if (nsock->dead) {
 840                int old_index = index;
 841                index = find_fallback(nbd, index);
 842                mutex_unlock(&nsock->tx_lock);
 843                if (index < 0) {
 844                        if (wait_for_reconnect(nbd)) {
 845                                index = old_index;
 846                                goto again;
 847                        }
  848                        /* All the sockets should already be down at this point;
  849                         * we just want to make sure that DISCONNECTED is set so
  850                         * any requests that come in that were queued waiting
 851                         * for the reconnect timer don't trigger the timer again
 852                         * and instead just error out.
 853                         */
 854                        sock_shutdown(nbd);
 855                        nbd_config_put(nbd);
 856                        blk_mq_start_request(req);
 857                        return -EIO;
 858                }
 859                goto again;
 860        }
 861
 862        /* Handle the case that we have a pending request that was partially
 863         * transmitted that _has_ to be serviced first.  We need to call requeue
 864         * here so that it gets put _after_ the request that is already on the
 865         * dispatch list.
 866         */
 867        blk_mq_start_request(req);
 868        if (unlikely(nsock->pending && nsock->pending != req)) {
 869                nbd_requeue_cmd(cmd);
 870                ret = 0;
 871                goto out;
 872        }
 873        /*
 874         * Some failures are related to the link going down, so anything that
 875         * returns EAGAIN can be retried on a different socket.
 876         */
 877        ret = nbd_send_cmd(nbd, cmd, index);
 878        if (ret == -EAGAIN) {
 879                dev_err_ratelimited(disk_to_dev(nbd->disk),
 880                                    "Request send failed, requeueing\n");
 881                nbd_mark_nsock_dead(nbd, nsock, 1);
 882                nbd_requeue_cmd(cmd);
 883                ret = 0;
 884        }
 885out:
 886        mutex_unlock(&nsock->tx_lock);
 887        nbd_config_put(nbd);
 888        return ret;
 889}
 890
 891static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 892                        const struct blk_mq_queue_data *bd)
 893{
 894        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 895        int ret;
 896
 897        /*
  898         * Since we look at the bios to send the request over the network we
 899         * need to make sure the completion work doesn't mark this request done
 900         * before we are done doing our send.  This keeps us from dereferencing
  901         * freed data if we have particularly fast completions (i.e. we get the
 902         * completion before we exit sock_xmit on the last bvec) or in the case
 903         * that the server is misbehaving (or there was an error) before we're
 904         * done sending everything over the wire.
 905         */
 906        mutex_lock(&cmd->lock);
 907        clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
 908
 909        /* We can be called directly from the user space process, which means we
 910         * could possibly have signals pending so our sendmsg will fail.  In
 911         * this case we need to return that we are busy, otherwise error out as
 912         * appropriate.
 913         */
 914        ret = nbd_handle_cmd(cmd, hctx->queue_num);
 915        if (ret < 0)
 916                ret = BLK_STS_IOERR;
 917        else if (!ret)
 918                ret = BLK_STS_OK;
 919        mutex_unlock(&cmd->lock);
 920
 921        return ret;
 922}
 923
 924static int nbd_check_sock_type(struct nbd_device *nbd, struct socket *sock)
 925{
 926        struct sockaddr addr;
 927        int err;
 928
 929        err = kernel_getsockname(sock, &addr);
 930        if (err < 0)
 931                return err;
 932
 933        if (addr.sa_family != AF_UNIX) {
 934                dev_err(disk_to_dev(nbd->disk),
 935                        "Only AF_UNIX sockets are supported.\n");
 936                return -EINVAL;
 937        }
 938
 939        return 0;
 940}
 941
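/*
 * Add a connection: look up the socket behind the file descriptor in
 * @arg, grow the socks array and append a freshly initialised nbd_sock
 * for it.  @netlink distinguishes the netlink path from the legacy
 * ioctl, which additionally pins the device to the task doing the
 * setup.
 */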
 942static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 943                          bool netlink)
 944{
 945        struct nbd_config *config = nbd->config;
 946        struct socket *sock;
 947        struct nbd_sock **socks;
 948        struct nbd_sock *nsock;
 949        int err;
 950
 951        sock = sockfd_lookup(arg, &err);
 952        if (!sock)
 953                return err;
 954
 955        err = nbd_check_sock_type(nbd, sock);
 956        if (err) {
 957                sockfd_put(sock);
 958                return err;
 959        }
 960
 961        if (!netlink && !nbd->task_setup &&
 962            !test_bit(NBD_BOUND, &config->runtime_flags))
 963                nbd->task_setup = current;
 964
 965        if (!netlink &&
 966            (nbd->task_setup != current ||
 967             test_bit(NBD_BOUND, &config->runtime_flags))) {
 968                dev_err(disk_to_dev(nbd->disk),
 969                        "Device being setup by another task");
 970                sockfd_put(sock);
 971                return -EBUSY;
 972        }
 973
 974        socks = krealloc(config->socks, (config->num_connections + 1) *
 975                         sizeof(struct nbd_sock *), GFP_KERNEL);
 976        if (!socks) {
 977                sockfd_put(sock);
 978                return -ENOMEM;
 979        }
 980        nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
 981        if (!nsock) {
 982                sockfd_put(sock);
 983                return -ENOMEM;
 984        }
 985
 986        config->socks = socks;
 987
 988        nsock->fallback_index = -1;
 989        nsock->dead = false;
 990        mutex_init(&nsock->tx_lock);
 991        nsock->sock = sock;
 992        nsock->pending = NULL;
 993        nsock->sent = 0;
 994        nsock->cookie = 0;
 995        socks[config->num_connections++] = nsock;
 996        atomic_inc(&config->live_connections);
 997
 998        return 0;
 999}
1000
1001static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
1002{
1003        struct nbd_config *config = nbd->config;
1004        struct socket *sock, *old;
1005        struct recv_thread_args *args;
1006        int i;
1007        int err;
1008
1009        sock = sockfd_lookup(arg, &err);
1010        if (!sock)
1011                return err;
1012
1013        args = kzalloc(sizeof(*args), GFP_KERNEL);
1014        if (!args) {
1015                sockfd_put(sock);
1016                return -ENOMEM;
1017        }
1018
1019        for (i = 0; i < config->num_connections; i++) {
1020                struct nbd_sock *nsock = config->socks[i];
1021
1022                if (!nsock->dead)
1023                        continue;
1024
1025                mutex_lock(&nsock->tx_lock);
1026                if (!nsock->dead) {
1027                        mutex_unlock(&nsock->tx_lock);
1028                        continue;
1029                }
1030                sk_set_memalloc(sock->sk);
1031                if (nbd->tag_set.timeout)
1032                        sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
1033                atomic_inc(&config->recv_threads);
1034                refcount_inc(&nbd->config_refs);
1035                old = nsock->sock;
1036                nsock->fallback_index = -1;
1037                nsock->sock = sock;
1038                nsock->dead = false;
1039                INIT_WORK(&args->work, recv_work);
1040                args->index = i;
1041                args->nbd = nbd;
1042                nsock->cookie++;
1043                mutex_unlock(&nsock->tx_lock);
1044                sockfd_put(old);
1045
1046                clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
1047
 1048                /* recv_work takes the tx_lock in an error path, so we
 1049                 * need to queue_work outside of the tx_lock.
1050                 */
1051                queue_work(recv_workqueue, &args->work);
1052
1053                atomic_inc(&config->live_connections);
1054                wake_up(&config->conn_wait);
1055                return 0;
1056        }
1057        sockfd_put(sock);
1058        kfree(args);
1059        return -ENOSPC;
1060}
1061
1062static void nbd_bdev_reset(struct block_device *bdev)
1063{
1064        if (bdev->bd_openers > 1)
1065                return;
1066        bd_set_size(bdev, 0);
1067}
1068
1069static void nbd_parse_flags(struct nbd_device *nbd)
1070{
1071        struct nbd_config *config = nbd->config;
1072        if (config->flags & NBD_FLAG_READ_ONLY)
1073                set_disk_ro(nbd->disk, true);
1074        else
1075                set_disk_ro(nbd->disk, false);
1076        if (config->flags & NBD_FLAG_SEND_TRIM)
1077                blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1078        if (config->flags & NBD_FLAG_SEND_FLUSH) {
1079                if (config->flags & NBD_FLAG_SEND_FUA)
1080                        blk_queue_write_cache(nbd->disk->queue, true, true);
1081                else
1082                        blk_queue_write_cache(nbd->disk->queue, true, false);
1083        }
1084        else
1085                blk_queue_write_cache(nbd->disk->queue, false, false);
1086}
1087
1088static void send_disconnects(struct nbd_device *nbd)
1089{
1090        struct nbd_config *config = nbd->config;
1091        struct nbd_request request = {
1092                .magic = htonl(NBD_REQUEST_MAGIC),
1093                .type = htonl(NBD_CMD_DISC),
1094        };
1095        struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
1096        struct iov_iter from;
1097        int i, ret;
1098
1099        for (i = 0; i < config->num_connections; i++) {
1100                struct nbd_sock *nsock = config->socks[i];
1101
1102                iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
1103                mutex_lock(&nsock->tx_lock);
1104                ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
1105                if (ret <= 0)
1106                        dev_err(disk_to_dev(nbd->disk),
1107                                "Send disconnect failed %d\n", ret);
1108                mutex_unlock(&nsock->tx_lock);
1109        }
1110}
1111
1112static int nbd_disconnect(struct nbd_device *nbd)
1113{
1114        struct nbd_config *config = nbd->config;
1115
1116        dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
1117        set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
1118        send_disconnects(nbd);
1119        return 0;
1120}
1121
1122static void nbd_clear_sock(struct nbd_device *nbd)
1123{
1124        sock_shutdown(nbd);
1125        nbd_clear_que(nbd);
1126        nbd->task_setup = NULL;
1127}
1128
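/*
 * Drop a reference on the current configuration.  When the last
 * reference goes away the sockets are shut down and released, the
 * size and discard settings are cleared, and the device reference
 * and module reference taken for this config are dropped.
 */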
1129static void nbd_config_put(struct nbd_device *nbd)
1130{
1131        if (refcount_dec_and_mutex_lock(&nbd->config_refs,
1132                                        &nbd->config_lock)) {
1133                struct nbd_config *config = nbd->config;
1134                nbd_dev_dbg_close(nbd);
1135                nbd_size_clear(nbd);
1136                if (test_and_clear_bit(NBD_HAS_PID_FILE,
1137                                       &config->runtime_flags))
1138                        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
1139                nbd->task_recv = NULL;
1140                nbd_clear_sock(nbd);
1141                if (config->num_connections) {
1142                        int i;
1143                        for (i = 0; i < config->num_connections; i++) {
1144                                sockfd_put(config->socks[i]->sock);
1145                                kfree(config->socks[i]);
1146                        }
1147                        kfree(config->socks);
1148                }
1149                kfree(nbd->config);
1150                nbd->config = NULL;
1151
1152                nbd->tag_set.timeout = 0;
1153                nbd->disk->queue->limits.discard_granularity = 0;
1154                nbd->disk->queue->limits.discard_alignment = 0;
1155                blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
1156                blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1157
1158                mutex_unlock(&nbd->config_lock);
1159                nbd_put(nbd);
1160                module_put(THIS_MODULE);
1161        }
1162}
1163
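/*
 * Start serving requests: one hardware queue is used per connection,
 * the receiver pid is exposed in sysfs, and a recv_work instance is
 * queued for every socket.
 */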
1164static int nbd_start_device(struct nbd_device *nbd)
1165{
1166        struct nbd_config *config = nbd->config;
1167        int num_connections = config->num_connections;
1168        int error = 0, i;
1169
1170        if (nbd->task_recv)
1171                return -EBUSY;
1172        if (!config->socks)
1173                return -EINVAL;
1174        if (num_connections > 1 &&
1175            !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
1176                dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
1177                return -EINVAL;
1178        }
1179
1180        blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
1181        nbd->task_recv = current;
1182
1183        nbd_parse_flags(nbd);
1184
1185        error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
1186        if (error) {
1187                dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
1188                return error;
1189        }
1190        set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
1191
1192        nbd_dev_dbg_init(nbd);
1193        for (i = 0; i < num_connections; i++) {
1194                struct recv_thread_args *args;
1195
1196                args = kzalloc(sizeof(*args), GFP_KERNEL);
1197                if (!args) {
1198                        sock_shutdown(nbd);
1199                        return -ENOMEM;
1200                }
1201                sk_set_memalloc(config->socks[i]->sock->sk);
1202                if (nbd->tag_set.timeout)
1203                        config->socks[i]->sock->sk->sk_sndtimeo =
1204                                nbd->tag_set.timeout;
1205                atomic_inc(&config->recv_threads);
1206                refcount_inc(&nbd->config_refs);
1207                INIT_WORK(&args->work, recv_work);
1208                args->nbd = nbd;
1209                args->index = i;
1210                queue_work(recv_workqueue, &args->work);
1211        }
1212        nbd_size_update(nbd);
1213        return error;
1214}
1215
1216static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
1217{
1218        struct nbd_config *config = nbd->config;
1219        int ret;
1220
1221        ret = nbd_start_device(nbd);
1222        if (ret)
1223                return ret;
1224
1225        if (max_part)
1226                bdev->bd_invalidated = 1;
1227        mutex_unlock(&nbd->config_lock);
1228        ret = wait_event_interruptible(config->recv_wq,
1229                                         atomic_read(&config->recv_threads) == 0);
1230        if (ret)
1231                sock_shutdown(nbd);
1232        mutex_lock(&nbd->config_lock);
1233        nbd_bdev_reset(bdev);
1234        /* user requested, ignore socket errors */
1235        if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
1236                ret = 0;
1237        if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
1238                ret = -ETIMEDOUT;
1239        return ret;
1240}
1241
1242static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
1243                                 struct block_device *bdev)
1244{
1245        sock_shutdown(nbd);
1246        kill_bdev(bdev);
1247        nbd_bdev_reset(bdev);
1248        if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1249                               &nbd->config->runtime_flags))
1250                nbd_config_put(nbd);
1251}
1252
1253/* Must be called with config_lock held */
1254static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1255                       unsigned int cmd, unsigned long arg)
1256{
1257        struct nbd_config *config = nbd->config;
1258
1259        switch (cmd) {
1260        case NBD_DISCONNECT:
1261                return nbd_disconnect(nbd);
1262        case NBD_CLEAR_SOCK:
1263                nbd_clear_sock_ioctl(nbd, bdev);
1264                return 0;
1265        case NBD_SET_SOCK:
1266                return nbd_add_socket(nbd, arg, false);
1267        case NBD_SET_BLKSIZE:
1268                if (!arg || !is_power_of_2(arg) || arg < 512 ||
1269                    arg > PAGE_SIZE)
1270                        return -EINVAL;
1271                nbd_size_set(nbd, arg,
1272                             div_s64(config->bytesize, arg));
1273                return 0;
1274        case NBD_SET_SIZE:
1275                nbd_size_set(nbd, config->blksize,
1276                             div_s64(arg, config->blksize));
1277                return 0;
1278        case NBD_SET_SIZE_BLOCKS:
1279                nbd_size_set(nbd, config->blksize, arg);
1280                return 0;
1281        case NBD_SET_TIMEOUT:
1282                if (arg) {
1283                        nbd->tag_set.timeout = arg * HZ;
1284                        blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
1285                }
1286                return 0;
1287
1288        case NBD_SET_FLAGS:
1289                config->flags = arg;
1290                return 0;
1291        case NBD_DO_IT:
1292                return nbd_start_device_ioctl(nbd, bdev);
1293        case NBD_CLEAR_QUE:
1294                /*
1295                 * This is for compatibility only.  The queue is always cleared
1296                 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1297                 */
1298                return 0;
1299        case NBD_PRINT_DEBUG:
1300                /*
1301                 * For compatibility only, we no longer keep a list of
1302                 * outstanding requests.
1303                 */
1304                return 0;
1305        }
1306        return -ENOTTY;
1307}
1308
1309static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
1310                     unsigned int cmd, unsigned long arg)
1311{
1312        struct nbd_device *nbd = bdev->bd_disk->private_data;
1313        struct nbd_config *config = nbd->config;
1314        int error = -EINVAL;
1315
1316        if (!capable(CAP_SYS_ADMIN))
1317                return -EPERM;
1318
1319        /* The block layer will pass back some non-nbd ioctls in case we have
 1320         * special handling for them, but we don't, so just return an error.
1321         */
1322        if (_IOC_TYPE(cmd) != 0xab)
1323                return -EINVAL;
1324
1325        mutex_lock(&nbd->config_lock);
1326
 1327        /* Don't allow ioctl operations on an nbd device that was created with
1328         * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
1329         */
1330        if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
1331            (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
1332                error = __nbd_ioctl(bdev, nbd, cmd, arg);
1333        else
1334                dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
1335        mutex_unlock(&nbd->config_lock);
1336        return error;
1337}
1338
1339static struct nbd_config *nbd_alloc_config(void)
1340{
1341        struct nbd_config *config;
1342
1343        config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1344        if (!config)
1345                return NULL;
1346        atomic_set(&config->recv_threads, 0);
1347        init_waitqueue_head(&config->recv_wq);
1348        init_waitqueue_head(&config->conn_wait);
1349        config->blksize = 1024;
1350        atomic_set(&config->live_connections, 0);
1351        try_module_get(THIS_MODULE);
1352        return config;
1353}
1354
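/*
 * Opening the block device takes a reference on the nbd_device and on
 * its configuration, allocating an empty configuration on first open
 * so that ioctls have something to operate on.
 */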
1355static int nbd_open(struct block_device *bdev, fmode_t mode)
1356{
1357        struct nbd_device *nbd;
1358        int ret = 0;
1359
1360        mutex_lock(&nbd_index_mutex);
1361        nbd = bdev->bd_disk->private_data;
1362        if (!nbd) {
1363                ret = -ENXIO;
1364                goto out;
1365        }
1366        if (!refcount_inc_not_zero(&nbd->refs)) {
1367                ret = -ENXIO;
1368                goto out;
1369        }
1370        if (!refcount_inc_not_zero(&nbd->config_refs)) {
1371                struct nbd_config *config;
1372
1373                mutex_lock(&nbd->config_lock);
1374                if (refcount_inc_not_zero(&nbd->config_refs)) {
1375                        mutex_unlock(&nbd->config_lock);
1376                        goto out;
1377                }
1378                config = nbd->config = nbd_alloc_config();
1379                if (!config) {
1380                        ret = -ENOMEM;
1381                        mutex_unlock(&nbd->config_lock);
1382                        goto out;
1383                }
1384                refcount_set(&nbd->config_refs, 1);
1385                refcount_inc(&nbd->refs);
1386                mutex_unlock(&nbd->config_lock);
1387                bdev->bd_invalidated = 1;
1388        } else if (nbd_disconnected(nbd->config)) {
1389                bdev->bd_invalidated = 1;
1390        }
1391out:
1392        mutex_unlock(&nbd_index_mutex);
1393        return ret;
1394}
1395
1396static void nbd_release(struct gendisk *disk, fmode_t mode)
1397{
1398        struct nbd_device *nbd = disk->private_data;
1399        struct block_device *bdev = bdget_disk(disk, 0);
1400
1401        if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
1402                        bdev->bd_openers == 0)
1403                nbd_disconnect_and_put(nbd);
1404
1405        nbd_config_put(nbd);
1406        nbd_put(nbd);
1407}
1408
1409static const struct block_device_operations nbd_fops =
1410{
1411        .owner =        THIS_MODULE,
1412        .open =         nbd_open,
1413        .release =      nbd_release,
1414        .ioctl =        nbd_ioctl,
1415        .compat_ioctl = nbd_ioctl,
1416};
1417
1418#if IS_ENABLED(CONFIG_DEBUG_FS)
1419
1420static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1421{
1422        struct nbd_device *nbd = s->private;
1423
1424        if (nbd->task_recv)
1425                seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
1426
1427        return 0;
1428}
1429
1430static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
1431{
1432        return single_open(file, nbd_dbg_tasks_show, inode->i_private);
1433}
1434
1435static const struct file_operations nbd_dbg_tasks_ops = {
1436        .open = nbd_dbg_tasks_open,
1437        .read = seq_read,
1438        .llseek = seq_lseek,
1439        .release = single_release,
1440};
1441
1442static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1443{
1444        struct nbd_device *nbd = s->private;
1445        u32 flags = nbd->config->flags;
1446
1447        seq_printf(s, "Hex: 0x%08x\n\n", flags);
1448
1449        seq_puts(s, "Known flags:\n");
1450
1451        if (flags & NBD_FLAG_HAS_FLAGS)
1452                seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1453        if (flags & NBD_FLAG_READ_ONLY)
1454                seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1455        if (flags & NBD_FLAG_SEND_FLUSH)
1456                seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1457        if (flags & NBD_FLAG_SEND_FUA)
1458                seq_puts(s, "NBD_FLAG_SEND_FUA\n");
1459        if (flags & NBD_FLAG_SEND_TRIM)
1460                seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1461
1462        return 0;
1463}
1464
1465static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
1466{
1467        return single_open(file, nbd_dbg_flags_show, inode->i_private);
1468}
1469
1470static const struct file_operations nbd_dbg_flags_ops = {
1471        .open = nbd_dbg_flags_open,
1472        .read = seq_read,
1473        .llseek = seq_lseek,
1474        .release = single_release,
1475};
1476
1477static int nbd_dev_dbg_init(struct nbd_device *nbd)
1478{
1479        struct dentry *dir;
1480        struct nbd_config *config = nbd->config;
1481
1482        if (!nbd_dbg_dir)
1483                return -EIO;
1484
1485        dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1486        if (!dir) {
1487                dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1488                        nbd_name(nbd));
1489                return -EIO;
1490        }
1491        config->dbg_dir = dir;
1492
1493        debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
1494        debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1495        debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1496        debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
1497        debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
1498
1499        return 0;
1500}
1501
1502static void nbd_dev_dbg_close(struct nbd_device *nbd)
1503{
1504        debugfs_remove_recursive(nbd->config->dbg_dir);
1505}
1506
1507static int nbd_dbg_init(void)
1508{
1509        struct dentry *dbg_dir;
1510
1511        dbg_dir = debugfs_create_dir("nbd", NULL);
1512        if (!dbg_dir)
1513                return -EIO;
1514
1515        nbd_dbg_dir = dbg_dir;
1516
1517        return 0;
1518}
1519
1520static void nbd_dbg_close(void)
1521{
1522        debugfs_remove_recursive(nbd_dbg_dir);
1523}
1524
1525#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
1526
1527static int nbd_dev_dbg_init(struct nbd_device *nbd)
1528{
1529        return 0;
1530}
1531
1532static void nbd_dev_dbg_close(struct nbd_device *nbd)
1533{
1534}
1535
1536static int nbd_dbg_init(void)
1537{
1538        return 0;
1539}
1540
1541static void nbd_dbg_close(void)
1542{
1543}
1544
1545#endif
1546
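    /*
     * blk-mq init_request callback: initialise the per-request nbd_cmd
     * pdu (back pointer to the device, flags and the command lock).
     */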
1547static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1548                            unsigned int hctx_idx, unsigned int numa_node)
1549{
1550        struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
1551        cmd->nbd = set->driver_data;
1552        cmd->flags = 0;
1553        mutex_init(&cmd->lock);
1554        return 0;
1555}
1556
1557static const struct blk_mq_ops nbd_mq_ops = {
1558        .queue_rq       = nbd_queue_rq,
1559        .complete       = nbd_complete_rq,
1560        .init_request   = nbd_init_request,
1561        .timeout        = nbd_xmit_timeout,
1562};
1563
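    /*
     * Allocate and register a new nbd device; callers hold
     * nbd_index_mutex.  A non-negative index requests that specific
     * minor (-EEXIST if already taken), index == -1 takes the first
     * free slot.  Returns the index used on success or a negative errno.
     */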
1564static int nbd_dev_add(int index)
1565{
1566        struct nbd_device *nbd;
1567        struct gendisk *disk;
1568        struct request_queue *q;
1569        int err = -ENOMEM;
1570
1571        nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1572        if (!nbd)
1573                goto out;
1574
1575        disk = alloc_disk(1 << part_shift);
1576        if (!disk)
1577                goto out_free_nbd;
1578
1579        if (index >= 0) {
1580                err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1581                                GFP_KERNEL);
1582                if (err == -ENOSPC)
1583                        err = -EEXIST;
1584        } else {
1585                err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1586                if (err >= 0)
1587                        index = err;
1588        }
1589        if (err < 0)
1590                goto out_free_disk;
1591
1592        nbd->index = index;
1593        nbd->disk = disk;
1594        nbd->tag_set.ops = &nbd_mq_ops;
1595        nbd->tag_set.nr_hw_queues = 1;
1596        nbd->tag_set.queue_depth = 128;
1597        nbd->tag_set.numa_node = NUMA_NO_NODE;
1598        nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1599        nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1600                BLK_MQ_F_BLOCKING;
1601        nbd->tag_set.driver_data = nbd;
1602
1603        err = blk_mq_alloc_tag_set(&nbd->tag_set);
1604        if (err)
1605                goto out_free_idr;
1606
1607        q = blk_mq_init_queue(&nbd->tag_set);
1608        if (IS_ERR(q)) {
1609                err = PTR_ERR(q);
1610                goto out_free_tags;
1611        }
1612        disk->queue = q;
1613
1614        /*
1615         * Tell the block layer that we are not a rotational device
1616         */
1617        blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
1618        blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1619        disk->queue->limits.discard_granularity = 0;
1620        disk->queue->limits.discard_alignment = 0;
1621        blk_queue_max_discard_sectors(disk->queue, 0);
1622        blk_queue_max_segment_size(disk->queue, UINT_MAX);
1623        blk_queue_max_segments(disk->queue, USHRT_MAX);
1624        blk_queue_max_hw_sectors(disk->queue, 65536);
1625        disk->queue->limits.max_sectors = 256;
1626
1627        mutex_init(&nbd->config_lock);
1628        refcount_set(&nbd->config_refs, 0);
1629        refcount_set(&nbd->refs, 1);
1630        INIT_LIST_HEAD(&nbd->list);
1631        disk->major = NBD_MAJOR;
1632        disk->first_minor = index << part_shift;
1633        disk->fops = &nbd_fops;
1634        disk->private_data = nbd;
1635        sprintf(disk->disk_name, "nbd%d", index);
1636        add_disk(disk);
1637        nbd_total_devices++;
1638        return index;
1639
1640out_free_tags:
1641        blk_mq_free_tag_set(&nbd->tag_set);
1642out_free_idr:
1643        idr_remove(&nbd_index_idr, index);
1644out_free_disk:
1645        put_disk(disk);
1646out_free_nbd:
1647        kfree(nbd);
1648out:
1649        return err;
1650}
1651
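    /*
     * idr_for_each() callback used by the netlink connect path to find
     * the first device with no active configuration (config_refs == 0).
     */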
1652static int find_free_cb(int id, void *ptr, void *data)
1653{
1654        struct nbd_device *nbd = ptr;
1655        struct nbd_device **found = data;
1656
1657        if (!refcount_read(&nbd->config_refs)) {
1658                *found = nbd;
1659                return 1;
1660        }
1661        return 0;
1662}
1663
1664/* Netlink interface. */
1665static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
1666        [NBD_ATTR_INDEX]                =       { .type = NLA_U32 },
1667        [NBD_ATTR_SIZE_BYTES]           =       { .type = NLA_U64 },
1668        [NBD_ATTR_BLOCK_SIZE_BYTES]     =       { .type = NLA_U64 },
1669        [NBD_ATTR_TIMEOUT]              =       { .type = NLA_U64 },
1670        [NBD_ATTR_SERVER_FLAGS]         =       { .type = NLA_U64 },
1671        [NBD_ATTR_CLIENT_FLAGS]         =       { .type = NLA_U64 },
1672        [NBD_ATTR_SOCKETS]              =       { .type = NLA_NESTED},
1673        [NBD_ATTR_DEAD_CONN_TIMEOUT]    =       { .type = NLA_U64 },
1674        [NBD_ATTR_DEVICE_LIST]          =       { .type = NLA_NESTED},
1675};
1676
1677static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
1678        [NBD_SOCK_FD]                   =       { .type = NLA_U32 },
1679};
1680
1681/* We don't use this right now since we don't parse the incoming list, but we
1682 * still want it here so userspace knows what to expect.
1683 */
1684static struct nla_policy __attribute__((unused))
1685nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1686        [NBD_DEVICE_INDEX]              =       { .type = NLA_U32 },
1687        [NBD_DEVICE_CONNECTED]          =       { .type = NLA_U8 },
1688};
1689
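    /*
     * NBD_CMD_CONNECT handler: look up (or create) the requested device,
     * set up a fresh config from the supplied attributes, add the socket
     * fds and start the device.  Requires CAP_SYS_ADMIN.
     */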
1690static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1691{
1692        struct nbd_device *nbd = NULL;
1693        struct nbd_config *config;
1694        int index = -1;
1695        int ret;
1696        bool put_dev = false;
1697
1698        if (!netlink_capable(skb, CAP_SYS_ADMIN))
1699                return -EPERM;
1700
1701        if (info->attrs[NBD_ATTR_INDEX])
1702                index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1703        if (!info->attrs[NBD_ATTR_SOCKETS]) {
1704                printk(KERN_ERR "nbd: must specify at least one socket\n");
1705                return -EINVAL;
1706        }
1707        if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
1708                printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
1709                return -EINVAL;
1710        }
1711again:
1712        mutex_lock(&nbd_index_mutex);
1713        if (index == -1) {
1714                ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
1715                if (ret == 0) {
1716                        int new_index;
1717                        new_index = nbd_dev_add(-1);
1718                        if (new_index < 0) {
1719                                mutex_unlock(&nbd_index_mutex);
1720                                printk(KERN_ERR "nbd: failed to add new device\n");
1721                                return new_index;
1722                        }
1723                        nbd = idr_find(&nbd_index_idr, new_index);
1724                }
1725        } else {
1726                nbd = idr_find(&nbd_index_idr, index);
1727                if (!nbd) {
1728                        ret = nbd_dev_add(index);
1729                        if (ret < 0) {
1730                                mutex_unlock(&nbd_index_mutex);
1731                                printk(KERN_ERR "nbd: failed to add new device\n");
1732                                return ret;
1733                        }
1734                        nbd = idr_find(&nbd_index_idr, index);
1735                }
1736        }
1737        if (!nbd) {
1738                printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1739                       index);
1740                mutex_unlock(&nbd_index_mutex);
1741                return -EINVAL;
1742        }
1743        if (!refcount_inc_not_zero(&nbd->refs)) {
1744                mutex_unlock(&nbd_index_mutex);
1745                if (index == -1)
1746                        goto again;
1747                printk(KERN_ERR "nbd: device at index %d is going down\n",
1748                       index);
1749                return -EINVAL;
1750        }
1751        mutex_unlock(&nbd_index_mutex);
1752
1753        mutex_lock(&nbd->config_lock);
1754        if (refcount_read(&nbd->config_refs)) {
1755                mutex_unlock(&nbd->config_lock);
1756                nbd_put(nbd);
1757                if (index == -1)
1758                        goto again;
1759                printk(KERN_ERR "nbd: nbd%d already in use\n", index);
1760                return -EBUSY;
1761        }
1762        if (WARN_ON(nbd->config)) {
1763                mutex_unlock(&nbd->config_lock);
1764                nbd_put(nbd);
1765                return -EINVAL;
1766        }
1767        config = nbd->config = nbd_alloc_config();
1768        if (!nbd->config) {
1769                mutex_unlock(&nbd->config_lock);
1770                nbd_put(nbd);
1771                printk(KERN_ERR "nbd: couldn't allocate config\n");
1772                return -ENOMEM;
1773        }
1774        refcount_set(&nbd->config_refs, 1);
1775        set_bit(NBD_BOUND, &config->runtime_flags);
1776
1777        if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
1778                u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1779                nbd_size_set(nbd, config->blksize,
1780                             div64_u64(bytes, config->blksize));
1781        }
1782        if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
1783                u64 bsize =
1784                        nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1785                nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
1786        }
1787        if (info->attrs[NBD_ATTR_TIMEOUT]) {
1788                u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1789                nbd->tag_set.timeout = timeout * HZ;
1790                blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1791        }
1792        if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1793                config->dead_conn_timeout =
1794                        nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1795                config->dead_conn_timeout *= HZ;
1796        }
1797        if (info->attrs[NBD_ATTR_SERVER_FLAGS])
1798                config->flags =
1799                        nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
1800        if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1801                u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1802                if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1803                        set_bit(NBD_DESTROY_ON_DISCONNECT,
1804                                &config->runtime_flags);
1805                        put_dev = true;
1806                }
1807                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1808                        set_bit(NBD_DISCONNECT_ON_CLOSE,
1809                                &config->runtime_flags);
1810                }
1811        }
1812
1813        if (info->attrs[NBD_ATTR_SOCKETS]) {
1814                struct nlattr *attr;
1815                int rem, fd;
1816
1817                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1818                                    rem) {
1819                        struct nlattr *socks[NBD_SOCK_MAX+1];
1820
1821                        if (nla_type(attr) != NBD_SOCK_ITEM) {
1822                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1823                                ret = -EINVAL;
1824                                goto out;
1825                        }
1826                        ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
1827                                               nbd_sock_policy, info->extack);
1828                        if (ret != 0) {
1829                                printk(KERN_ERR "nbd: error processing sock list\n");
1830                                ret = -EINVAL;
1831                                goto out;
1832                        }
1833                        if (!socks[NBD_SOCK_FD])
1834                                continue;
1835                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1836                        ret = nbd_add_socket(nbd, fd, true);
1837                        if (ret)
1838                                goto out;
1839                }
1840        }
1841        ret = nbd_start_device(nbd);
1842out:
1843        mutex_unlock(&nbd->config_lock);
1844        if (!ret) {
1845                set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
1846                refcount_inc(&nbd->config_refs);
1847                nbd_connect_reply(info, nbd->index);
1848        }
1849        nbd_config_put(nbd);
1850        if (put_dev)
1851                nbd_put(nbd);
1852        return ret;
1853}
1854
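    /*
     * Ask the server to disconnect, kill the sockets and drop the extra
     * config reference (NBD_HAS_CONFIG_REF) taken when the device was
     * configured.
     */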
1855static void nbd_disconnect_and_put(struct nbd_device *nbd)
1856{
1857        mutex_lock(&nbd->config_lock);
1858        nbd_disconnect(nbd);
1859        nbd_clear_sock(nbd);
1860        mutex_unlock(&nbd->config_lock);
1861        if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1862                               &nbd->config->runtime_flags))
1863                nbd_config_put(nbd);
1864}
1865
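    /*
     * NBD_CMD_DISCONNECT handler: disconnect the device at the given
     * index if it currently has a configuration.
     */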
1866static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
1867{
1868        struct nbd_device *nbd;
1869        int index;
1870
1871        if (!netlink_capable(skb, CAP_SYS_ADMIN))
1872                return -EPERM;
1873
1874        if (!info->attrs[NBD_ATTR_INDEX]) {
1875                printk(KERN_ERR "nbd: must specify an index to disconnect\n");
1876                return -EINVAL;
1877        }
1878        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1879        mutex_lock(&nbd_index_mutex);
1880        nbd = idr_find(&nbd_index_idr, index);
1881        if (!nbd) {
1882                mutex_unlock(&nbd_index_mutex);
1883                printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1884                       index);
1885                return -EINVAL;
1886        }
1887        if (!refcount_inc_not_zero(&nbd->refs)) {
1888                mutex_unlock(&nbd_index_mutex);
1889                printk(KERN_ERR "nbd: device at index %d is going down\n",
1890                       index);
1891                return -EINVAL;
1892        }
1893        mutex_unlock(&nbd_index_mutex);
1894        if (!refcount_inc_not_zero(&nbd->config_refs)) {
1895                nbd_put(nbd);
1896                return 0;
1897        }
1898        nbd_disconnect_and_put(nbd);
1899        nbd_config_put(nbd);
1900        nbd_put(nbd);
1901        return 0;
1902}
1903
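    /*
     * NBD_CMD_RECONFIGURE handler: update timeouts and client flags on a
     * running device and feed replacement sockets to dead connections.
     */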
1904static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
1905{
1906        struct nbd_device *nbd = NULL;
1907        struct nbd_config *config;
1908        int index;
1909        int ret = 0;
1910        bool put_dev = false;
1911
1912        if (!netlink_capable(skb, CAP_SYS_ADMIN))
1913                return -EPERM;
1914
1915        if (!info->attrs[NBD_ATTR_INDEX]) {
1916                printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
1917                return -EINVAL;
1918        }
1919        index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1920        mutex_lock(&nbd_index_mutex);
1921        nbd = idr_find(&nbd_index_idr, index);
1922        if (!nbd) {
1923                mutex_unlock(&nbd_index_mutex);
1924                printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
1925                       index);
1926                return -EINVAL;
1927        }
1928        if (!refcount_inc_not_zero(&nbd->refs)) {
1929                mutex_unlock(&nbd_index_mutex);
1930                printk(KERN_ERR "nbd: device at index %d is going down\n",
1931                       index);
1932                return -EINVAL;
1933        }
1934        mutex_unlock(&nbd_index_mutex);
1935
1936        if (!refcount_inc_not_zero(&nbd->config_refs)) {
1937                dev_err(nbd_to_dev(nbd),
1938                        "not configured, cannot reconfigure\n");
1939                nbd_put(nbd);
1940                return -EINVAL;
1941        }
1942
1943        mutex_lock(&nbd->config_lock);
1944        config = nbd->config;
1945        if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
1946            !nbd->task_recv) {
1947                dev_err(nbd_to_dev(nbd),
1948                        "not configured, cannot reconfigure\n");
1949                ret = -EINVAL;
1950                goto out;
1951        }
1952
1953        if (info->attrs[NBD_ATTR_TIMEOUT]) {
1954                u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1955                nbd->tag_set.timeout = timeout * HZ;
1956                blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1957        }
1958        if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1959                config->dead_conn_timeout =
1960                        nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1961                config->dead_conn_timeout *= HZ;
1962        }
1963        if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1964                u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1965                if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1966                        if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
1967                                              &config->runtime_flags))
1968                                put_dev = true;
1969                } else {
1970                        if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
1971                                               &config->runtime_flags))
1972                                refcount_inc(&nbd->refs);
1973                }
1974
1975                if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1976                        set_bit(NBD_DISCONNECT_ON_CLOSE,
1977                                        &config->runtime_flags);
1978                } else {
1979                        clear_bit(NBD_DISCONNECT_ON_CLOSE,
1980                                        &config->runtime_flags);
1981                }
1982        }
1983
1984        if (info->attrs[NBD_ATTR_SOCKETS]) {
1985                struct nlattr *attr;
1986                int rem, fd;
1987
1988                nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1989                                    rem) {
1990                        struct nlattr *socks[NBD_SOCK_MAX+1];
1991
1992                        if (nla_type(attr) != NBD_SOCK_ITEM) {
1993                                printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1994                                ret = -EINVAL;
1995                                goto out;
1996                        }
1997                        ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
1998                                               nbd_sock_policy, info->extack);
1999                        if (ret != 0) {
2000                                printk(KERN_ERR "nbd: error processing sock list\n");
2001                                ret = -EINVAL;
2002                                goto out;
2003                        }
2004                        if (!socks[NBD_SOCK_FD])
2005                                continue;
2006                        fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
2007                        ret = nbd_reconnect_socket(nbd, fd);
2008                        if (ret) {
2009                                if (ret == -ENOSPC)
2010                                        ret = 0;
2011                                goto out;
2012                        }
2013                        dev_info(nbd_to_dev(nbd), "reconnected socket\n");
2014                }
2015        }
2016out:
2017        mutex_unlock(&nbd->config_lock);
2018        nbd_config_put(nbd);
2019        nbd_put(nbd);
2020        if (put_dev)
2021                nbd_put(nbd);
2022        return ret;
2023}
2024
2025static const struct genl_ops nbd_connect_genl_ops[] = {
2026        {
2027                .cmd    = NBD_CMD_CONNECT,
2028                .policy = nbd_attr_policy,
2029                .doit   = nbd_genl_connect,
2030        },
2031        {
2032                .cmd    = NBD_CMD_DISCONNECT,
2033                .policy = nbd_attr_policy,
2034                .doit   = nbd_genl_disconnect,
2035        },
2036        {
2037                .cmd    = NBD_CMD_RECONFIGURE,
2038                .policy = nbd_attr_policy,
2039                .doit   = nbd_genl_reconfigure,
2040        },
2041        {
2042                .cmd    = NBD_CMD_STATUS,
2043                .policy = nbd_attr_policy,
2044                .doit   = nbd_genl_status,
2045        },
2046};
2047
2048static const struct genl_multicast_group nbd_mcast_grps[] = {
2049        { .name = NBD_GENL_MCAST_GROUP_NAME, },
2050};
2051
2052static struct genl_family nbd_genl_family __ro_after_init = {
2053        .hdrsize        = 0,
2054        .name           = NBD_GENL_FAMILY_NAME,
2055        .version        = NBD_GENL_VERSION,
2056        .module         = THIS_MODULE,
2057        .ops            = nbd_connect_genl_ops,
2058        .n_ops          = ARRAY_SIZE(nbd_connect_genl_ops),
2059        .maxattr        = NBD_ATTR_MAX,
2060        .mcgrps         = nbd_mcast_grps,
2061        .n_mcgrps       = ARRAY_SIZE(nbd_mcast_grps),
2062};
2063
2064static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
2065{
2066        struct nlattr *dev_opt;
2067        u8 connected = 0;
2068        int ret;
2069
2070        /* This is a little racy, but for status it's ok.  We
2071         * don't take a ref here because we can't take one in
2072         * the index == -1 case: we would have to drop it under
2073         * nbd_index_mutex, which could deadlock if we are
2074         * configured to remove ourselves once we're
2075         * disconnected.
2076         */
2077        if (refcount_read(&nbd->config_refs))
2078                connected = 1;
2079        dev_opt = nla_nest_start(reply, NBD_DEVICE_ITEM);
2080        if (!dev_opt)
2081                return -EMSGSIZE;
2082        ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
2083        if (ret)
2084                return -EMSGSIZE;
2085        ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
2086                         connected);
2087        if (ret)
2088                return -EMSGSIZE;
2089        nla_nest_end(reply, dev_opt);
2090        return 0;
2091}
2092
2093static int status_cb(int id, void *ptr, void *data)
2094{
2095        struct nbd_device *nbd = ptr;
2096        return populate_nbd_status(nbd, (struct sk_buff *)data);
2097}
2098
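    /*
     * NBD_CMD_STATUS handler: reply with the connected state of the
     * device at NBD_ATTR_INDEX, or of every device when no index is
     * given.
     */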
2099static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
2100{
2101        struct nlattr *dev_list;
2102        struct sk_buff *reply;
2103        void *reply_head;
2104        size_t msg_size;
2105        int index = -1;
2106        int ret = -ENOMEM;
2107
2108        if (info->attrs[NBD_ATTR_INDEX])
2109                index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2110
2111        mutex_lock(&nbd_index_mutex);
2112
2113        msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
2114                                  nla_attr_size(sizeof(u8)));
2115        msg_size *= (index == -1) ? nbd_total_devices : 1;
2116
2117        reply = genlmsg_new(msg_size, GFP_KERNEL);
2118        if (!reply)
2119                goto out;
2120        reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2121                                       NBD_CMD_STATUS);
2122        if (!reply_head) {
2123                nlmsg_free(reply);
2124                goto out;
2125        }
2126
2127        dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST);
2128        if (index == -1) {
2129                ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2130                if (ret) {
2131                        nlmsg_free(reply);
2132                        goto out;
2133                }
2134        } else {
2135                struct nbd_device *nbd;
2136                nbd = idr_find(&nbd_index_idr, index);
2137                if (nbd) {
2138                        ret = populate_nbd_status(nbd, reply);
2139                        if (ret) {
2140                                nlmsg_free(reply);
2141                                goto out;
2142                        }
2143                }
2144        }
2145        nla_nest_end(reply, dev_list);
2146        genlmsg_end(reply, reply_head);
2147        genlmsg_reply(reply, info);
2148        ret = 0;
2149out:
2150        mutex_unlock(&nbd_index_mutex);
2151        return ret;
2152}
2153
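    /* Send the index of the newly configured device back to the requester. */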
2154static void nbd_connect_reply(struct genl_info *info, int index)
2155{
2156        struct sk_buff *skb;
2157        void *msg_head;
2158        int ret;
2159
2160        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2161        if (!skb)
2162                return;
2163        msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2164                                     NBD_CMD_CONNECT);
2165        if (!msg_head) {
2166                nlmsg_free(skb);
2167                return;
2168        }
2169        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2170        if (ret) {
2171                nlmsg_free(skb);
2172                return;
2173        }
2174        genlmsg_end(skb, msg_head);
2175        genlmsg_reply(skb, info);
2176}
2177
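    /*
     * Broadcast an NBD_CMD_LINK_DEAD notification carrying the device
     * index to the nbd multicast group.
     */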
2178static void nbd_mcast_index(int index)
2179{
2180        struct sk_buff *skb;
2181        void *msg_head;
2182        int ret;
2183
2184        skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2185        if (!skb)
2186                return;
2187        msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2188                                     NBD_CMD_LINK_DEAD);
2189        if (!msg_head) {
2190                nlmsg_free(skb);
2191                return;
2192        }
2193        ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2194        if (ret) {
2195                nlmsg_free(skb);
2196                return;
2197        }
2198        genlmsg_end(skb, msg_head);
2199        genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2200}
2201
2202static void nbd_dead_link_work(struct work_struct *work)
2203{
2204        struct link_dead_args *args = container_of(work, struct link_dead_args,
2205                                                   work);
2206        nbd_mcast_index(args->index);
2207        kfree(args);
2208}
2209
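    /*
     * Module init: validate max_part/nbds_max, create the receive
     * workqueue, register the block major, the netlink family and the
     * debugfs root, then pre-create nbds_max devices.
     */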
2210static int __init nbd_init(void)
2211{
2212        int i;
2213
2214        BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
2215
2216        if (max_part < 0) {
2217                printk(KERN_ERR "nbd: max_part must be >= 0\n");
2218                return -EINVAL;
2219        }
2220
2221        part_shift = 0;
2222        if (max_part > 0) {
2223                part_shift = fls(max_part);
2224
2225                /*
2226                 * Adjust max_part according to part_shift as it is exported
2227                 * to user space so that users can know the maximum number
2228                 * of partitions the kernel is able to manage.
2229                 *
2230                 * Note that -1 is required because partition 0 is reserved
2231                 * for the whole disk.
2232                 */
2233                max_part = (1UL << part_shift) - 1;
2234        }
2235
2236        if ((1UL << part_shift) > DISK_MAX_PARTS)
2237                return -EINVAL;
2238
2239        if (nbds_max > 1UL << (MINORBITS - part_shift))
2240                return -EINVAL;
2241        recv_workqueue = alloc_workqueue("knbd-recv",
2242                                         WQ_MEM_RECLAIM | WQ_HIGHPRI |
2243                                         WQ_UNBOUND, 0);
2244        if (!recv_workqueue)
2245                return -ENOMEM;
2246
2247        if (register_blkdev(NBD_MAJOR, "nbd")) {
2248                destroy_workqueue(recv_workqueue);
2249                return -EIO;
2250        }
2251
2252        if (genl_register_family(&nbd_genl_family)) {
2253                unregister_blkdev(NBD_MAJOR, "nbd");
2254                destroy_workqueue(recv_workqueue);
2255                return -EINVAL;
2256        }
2257        nbd_dbg_init();
2258
2259        mutex_lock(&nbd_index_mutex);
2260        for (i = 0; i < nbds_max; i++)
2261                nbd_dev_add(i);
2262        mutex_unlock(&nbd_index_mutex);
2263        return 0;
2264}
2265
2266static int nbd_exit_cb(int id, void *ptr, void *data)
2267{
2268        struct list_head *list = (struct list_head *)data;
2269        struct nbd_device *nbd = ptr;
2270
2271        list_add_tail(&nbd->list, list);
2272        return 0;
2273}
2274
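    /*
     * Module exit: collect every registered device from the idr, drop
     * the initial reference on each, then tear down the netlink family,
     * the receive workqueue and the block major.
     */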
2275static void __exit nbd_cleanup(void)
2276{
2277        struct nbd_device *nbd;
2278        LIST_HEAD(del_list);
2279
2280        nbd_dbg_close();
2281
2282        mutex_lock(&nbd_index_mutex);
2283        idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2284        mutex_unlock(&nbd_index_mutex);
2285
2286        while (!list_empty(&del_list)) {
2287                nbd = list_first_entry(&del_list, struct nbd_device, list);
2288                list_del_init(&nbd->list);
2289                if (refcount_read(&nbd->refs) != 1)
2290                        printk(KERN_ERR "nbd: possibly leaking a device\n");
2291                nbd_put(nbd);
2292        }
2293
2294        idr_destroy(&nbd_index_idr);
2295        genl_unregister_family(&nbd_genl_family);
2296        destroy_workqueue(recv_workqueue);
2297        unregister_blkdev(NBD_MAJOR, "nbd");
2298}
2299
2300module_init(nbd_init);
2301module_exit(nbd_cleanup);
2302
2303MODULE_DESCRIPTION("Network Block Device");
2304MODULE_LICENSE("GPL");
2305
2306module_param(nbds_max, int, 0444);
2307MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2308module_param(max_part, int, 0444);
2309MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
2310
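    /*
     * Example (hypothetical values): load the module with room for 32
     * devices and up to 8 partitions per device:
     *
     *   modprobe nbd nbds_max=32 max_part=8
     *
     * Both parameters are 0444, i.e. visible under
     * /sys/module/nbd/parameters/ but not writable at runtime.
     */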