linux/drivers/nvme/target/core.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Common code for the NVMe target.
   4 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/random.h>
   9#include <linux/rculist.h>
  10#include <linux/pci-p2pdma.h>
  11#include <linux/scatterlist.h>
  12
  13#define CREATE_TRACE_POINTS
  14#include "trace.h"
  15
  16#include "nvmet.h"
  17
  18struct workqueue_struct *buffered_io_wq;
  19static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
  20static DEFINE_IDA(cntlid_ida);
  21
  22/*
  23 * This read/write semaphore is used to synchronize access to configuration
  24 * information on a target system that will result in discovery log page
  25 * information change for at least one host.
   26 * The full list of resources protected by this semaphore is:
  27 *
  28 *  - subsystems list
  29 *  - per-subsystem allowed hosts list
  30 *  - allow_any_host subsystem attribute
  31 *  - nvmet_genctr
  32 *  - the nvmet_transports array
  33 *
   34 * When updating any of those lists/structures, the write lock must be held,
   35 * while readers (populating the discovery log page or checking a host-subsystem
   36 * link) take the read lock to allow concurrent reads.
  37 */
  38DECLARE_RWSEM(nvmet_config_sem);
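/*
 * Usage sketch (illustrative, drawn from this file): writers wrap
 * configuration updates in down_write(&nvmet_config_sem)/up_write(), as in
 * nvmet_register_transport(), while readers such as
 * nvmet_port_send_ana_event() and nvmet_find_get_subsys() traverse the
 * protected lists under down_read()/up_read().
 */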
  39
  40u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
  41u64 nvmet_ana_chgcnt;
  42DECLARE_RWSEM(nvmet_ana_sem);
  43
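/*
 * Translate a Linux errno reported by a backend into an NVMe status code and
 * record the offending command field in req->error_loc for the error log.
 */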
  44inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
  45{
  46        u16 status;
  47
  48        switch (errno) {
  49        case 0:
  50                status = NVME_SC_SUCCESS;
  51                break;
  52        case -ENOSPC:
  53                req->error_loc = offsetof(struct nvme_rw_command, length);
  54                status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
  55                break;
  56        case -EREMOTEIO:
  57                req->error_loc = offsetof(struct nvme_rw_command, slba);
  58                status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
  59                break;
  60        case -EOPNOTSUPP:
  61                req->error_loc = offsetof(struct nvme_common_command, opcode);
  62                switch (req->cmd->common.opcode) {
  63                case nvme_cmd_dsm:
  64                case nvme_cmd_write_zeroes:
  65                        status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
  66                        break;
  67                default:
  68                        status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
  69                }
  70                break;
  71        case -ENODATA:
  72                req->error_loc = offsetof(struct nvme_rw_command, nsid);
  73                status = NVME_SC_ACCESS_DENIED;
  74                break;
  75        case -EIO:
  76                fallthrough;
  77        default:
  78                req->error_loc = offsetof(struct nvme_common_command, opcode);
  79                status = NVME_SC_INTERNAL | NVME_SC_DNR;
  80        }
  81
  82        return status;
  83}
  84
  85static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
  86                const char *subsysnqn);
  87
  88u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
  89                size_t len)
  90{
  91        if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
  92                req->error_loc = offsetof(struct nvme_common_command, dptr);
  93                return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
  94        }
  95        return 0;
  96}
  97
  98u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
  99{
 100        if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
 101                req->error_loc = offsetof(struct nvme_common_command, dptr);
 102                return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
 103        }
 104        return 0;
 105}
 106
 107u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
 108{
 109        if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
 110                req->error_loc = offsetof(struct nvme_common_command, dptr);
 111                return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
 112        }
 113        return 0;
 114}
 115
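/*
 * The namespaces xarray is walked in increasing index order, so the last
 * entry visited carries the highest NSID; returns 0 for an empty subsystem.
 */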
 116static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
 117{
 118        unsigned long nsid = 0;
 119        struct nvmet_ns *cur;
 120        unsigned long idx;
 121
 122        xa_for_each(&subsys->namespaces, idx, cur)
 123                nsid = cur->nsid;
 124
 125        return nsid;
 126}
 127
 128static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
 129{
 130        return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
 131}
 132
 133static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl)
 134{
 135        u16 status = NVME_SC_INTERNAL | NVME_SC_DNR;
 136        struct nvmet_req *req;
 137
 138        mutex_lock(&ctrl->lock);
 139        while (ctrl->nr_async_event_cmds) {
 140                req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 141                mutex_unlock(&ctrl->lock);
 142                nvmet_req_complete(req, status);
 143                mutex_lock(&ctrl->lock);
 144        }
 145        mutex_unlock(&ctrl->lock);
 146}
 147
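/*
 * Pair queued AEN payloads with outstanding Asynchronous Event Request
 * commands: each pending event completes one AER with its encoded result,
 * dropping and re-acquiring ctrl->lock around nvmet_req_complete().
 */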
 148static void nvmet_async_events_process(struct nvmet_ctrl *ctrl)
 149{
 150        struct nvmet_async_event *aen;
 151        struct nvmet_req *req;
 152
 153        mutex_lock(&ctrl->lock);
 154        while (ctrl->nr_async_event_cmds && !list_empty(&ctrl->async_events)) {
 155                aen = list_first_entry(&ctrl->async_events,
 156                                       struct nvmet_async_event, entry);
 157                req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
 158                nvmet_set_result(req, nvmet_async_event_result(aen));
 159
 160                list_del(&aen->entry);
 161                kfree(aen);
 162
 163                mutex_unlock(&ctrl->lock);
 164                trace_nvmet_async_event(ctrl, req->cqe->result.u32);
 165                nvmet_req_complete(req, 0);
 166                mutex_lock(&ctrl->lock);
 167        }
 168        mutex_unlock(&ctrl->lock);
 169}
 170
 171static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
 172{
 173        struct nvmet_async_event *aen, *tmp;
 174
 175        mutex_lock(&ctrl->lock);
 176        list_for_each_entry_safe(aen, tmp, &ctrl->async_events, entry) {
 177                list_del(&aen->entry);
 178                kfree(aen);
 179        }
 180        mutex_unlock(&ctrl->lock);
 181}
 182
 183static void nvmet_async_event_work(struct work_struct *work)
 184{
 185        struct nvmet_ctrl *ctrl =
 186                container_of(work, struct nvmet_ctrl, async_event_work);
 187
 188        nvmet_async_events_process(ctrl);
 189}
 190
 191void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 192                u8 event_info, u8 log_page)
 193{
 194        struct nvmet_async_event *aen;
 195
 196        aen = kmalloc(sizeof(*aen), GFP_KERNEL);
 197        if (!aen)
 198                return;
 199
 200        aen->event_type = event_type;
 201        aen->event_info = event_info;
 202        aen->log_page = log_page;
 203
 204        mutex_lock(&ctrl->lock);
 205        list_add_tail(&aen->entry, &ctrl->async_events);
 206        mutex_unlock(&ctrl->lock);
 207
 208        schedule_work(&ctrl->async_event_work);
 209}
 210
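/*
 * Record a namespace ID in the controller's Changed Namespace List log.
 * Duplicates are ignored; once more than NVME_MAX_CHANGED_NAMESPACES distinct
 * namespaces have changed, the list collapses to a single 0xffffffff entry
 * and nr_changed_ns is parked at U32_MAX as an overflow marker.
 */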
 211static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
 212{
 213        u32 i;
 214
 215        mutex_lock(&ctrl->lock);
 216        if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
 217                goto out_unlock;
 218
 219        for (i = 0; i < ctrl->nr_changed_ns; i++) {
 220                if (ctrl->changed_ns_list[i] == nsid)
 221                        goto out_unlock;
 222        }
 223
 224        if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
 225                ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
 226                ctrl->nr_changed_ns = U32_MAX;
 227                goto out_unlock;
 228        }
 229
 230        ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
 231out_unlock:
 232        mutex_unlock(&ctrl->lock);
 233}
 234
 235void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
 236{
 237        struct nvmet_ctrl *ctrl;
 238
 239        lockdep_assert_held(&subsys->lock);
 240
 241        list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 242                nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
 243                if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
 244                        continue;
 245                nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 246                                NVME_AER_NOTICE_NS_CHANGED,
 247                                NVME_LOG_CHANGED_NS);
 248        }
 249}
 250
 251void nvmet_send_ana_event(struct nvmet_subsys *subsys,
 252                struct nvmet_port *port)
 253{
 254        struct nvmet_ctrl *ctrl;
 255
 256        mutex_lock(&subsys->lock);
 257        list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 258                if (port && ctrl->port != port)
 259                        continue;
 260                if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
 261                        continue;
 262                nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 263                                NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
 264        }
 265        mutex_unlock(&subsys->lock);
 266}
 267
 268void nvmet_port_send_ana_event(struct nvmet_port *port)
 269{
 270        struct nvmet_subsys_link *p;
 271
 272        down_read(&nvmet_config_sem);
 273        list_for_each_entry(p, &port->subsystems, entry)
 274                nvmet_send_ana_event(p->subsys, port);
 275        up_read(&nvmet_config_sem);
 276}
 277
 278int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
 279{
 280        int ret = 0;
 281
 282        down_write(&nvmet_config_sem);
 283        if (nvmet_transports[ops->type])
 284                ret = -EINVAL;
 285        else
 286                nvmet_transports[ops->type] = ops;
 287        up_write(&nvmet_config_sem);
 288
 289        return ret;
 290}
 291EXPORT_SYMBOL_GPL(nvmet_register_transport);
 292
 293void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
 294{
 295        down_write(&nvmet_config_sem);
 296        nvmet_transports[ops->type] = NULL;
 297        up_write(&nvmet_config_sem);
 298}
 299EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
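/*
 * Transport registration sketch (illustrative, based on how this file uses
 * struct nvmet_fabrics_ops): a fabrics driver fills in .type, .owner, .flags
 * and the .add_port/.remove_port, .queue_response and .delete_ctrl callbacks
 * referenced below, calls nvmet_register_transport() from its module init and
 * nvmet_unregister_transport() on exit.
 */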
 300
 301void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
 302{
 303        struct nvmet_ctrl *ctrl;
 304
 305        mutex_lock(&subsys->lock);
 306        list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 307                if (ctrl->port == port)
 308                        ctrl->ops->delete_ctrl(ctrl);
 309        }
 310        mutex_unlock(&subsys->lock);
 311}
 312
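/*
 * Enable a configured port: look up the transport ops (temporarily dropping
 * nvmet_config_sem to request_module() the transport if it is not loaded),
 * reject T10-PI if the transport lacks NVMF_METADATA_SUPPORTED, then hand the
 * port to the transport via ops->add_port().
 */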
 313int nvmet_enable_port(struct nvmet_port *port)
 314{
 315        const struct nvmet_fabrics_ops *ops;
 316        int ret;
 317
 318        lockdep_assert_held(&nvmet_config_sem);
 319
 320        ops = nvmet_transports[port->disc_addr.trtype];
 321        if (!ops) {
 322                up_write(&nvmet_config_sem);
 323                request_module("nvmet-transport-%d", port->disc_addr.trtype);
 324                down_write(&nvmet_config_sem);
 325                ops = nvmet_transports[port->disc_addr.trtype];
 326                if (!ops) {
 327                        pr_err("transport type %d not supported\n",
 328                                port->disc_addr.trtype);
 329                        return -EINVAL;
 330                }
 331        }
 332
 333        if (!try_module_get(ops->owner))
 334                return -EINVAL;
 335
 336        /*
 337         * If the user requested PI support and the transport isn't pi capable,
 338         * don't enable the port.
 339         */
 340        if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) {
 341                pr_err("T10-PI is not supported by transport type %d\n",
 342                       port->disc_addr.trtype);
 343                ret = -EINVAL;
 344                goto out_put;
 345        }
 346
 347        ret = ops->add_port(port);
 348        if (ret)
 349                goto out_put;
 350
 351        /* If the transport didn't set inline_data_size, then disable it. */
 352        if (port->inline_data_size < 0)
 353                port->inline_data_size = 0;
 354
 355        port->enabled = true;
 356        port->tr_ops = ops;
 357        return 0;
 358
 359out_put:
 360        module_put(ops->owner);
 361        return ret;
 362}
 363
 364void nvmet_disable_port(struct nvmet_port *port)
 365{
 366        const struct nvmet_fabrics_ops *ops;
 367
 368        lockdep_assert_held(&nvmet_config_sem);
 369
 370        port->enabled = false;
 371        port->tr_ops = NULL;
 372
 373        ops = nvmet_transports[port->disc_addr.trtype];
 374        ops->remove_port(port);
 375        module_put(ops->owner);
 376}
 377
 378static void nvmet_keep_alive_timer(struct work_struct *work)
 379{
 380        struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
 381                        struct nvmet_ctrl, ka_work);
 382        bool cmd_seen = ctrl->cmd_seen;
 383
 384        ctrl->cmd_seen = false;
 385        if (cmd_seen) {
 386                pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
 387                        ctrl->cntlid);
 388                schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 389                return;
 390        }
 391
 392        pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
 393                ctrl->cntlid, ctrl->kato);
 394
 395        nvmet_ctrl_fatal_error(ctrl);
 396}
 397
 398void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
 399{
 400        if (unlikely(ctrl->kato == 0))
 401                return;
 402
 403        pr_debug("ctrl %d start keep-alive timer for %d secs\n",
 404                ctrl->cntlid, ctrl->kato);
 405
 406        INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
 407        schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 408}
 409
 410void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 411{
 412        if (unlikely(ctrl->kato == 0))
 413                return;
 414
 415        pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
 416
 417        cancel_delayed_work_sync(&ctrl->ka_work);
 418}
 419
 420struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
 421{
 422        struct nvmet_ns *ns;
 423
 424        ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid));
 425        if (ns)
 426                percpu_ref_get(&ns->ref);
 427
 428        return ns;
 429}
 430
 431static void nvmet_destroy_namespace(struct percpu_ref *ref)
 432{
 433        struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);
 434
 435        complete(&ns->disable_done);
 436}
 437
 438void nvmet_put_namespace(struct nvmet_ns *ns)
 439{
 440        percpu_ref_put(&ns->ref);
 441}
 442
 443static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
 444{
 445        nvmet_bdev_ns_disable(ns);
 446        nvmet_file_ns_disable(ns);
 447}
 448
 449static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
 450{
 451        int ret;
 452        struct pci_dev *p2p_dev;
 453
 454        if (!ns->use_p2pmem)
 455                return 0;
 456
 457        if (!ns->bdev) {
 458                pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n");
 459                return -EINVAL;
 460        }
 461
 462        if (!blk_queue_pci_p2pdma(ns->bdev->bd_disk->queue)) {
 463                pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
 464                       ns->device_path);
 465                return -EINVAL;
 466        }
 467
 468        if (ns->p2p_dev) {
 469                ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true);
 470                if (ret < 0)
 471                        return -EINVAL;
 472        } else {
 473                /*
 474                 * Right now we just check that there is p2pmem available so
 475                 * we can report an error to the user right away if there
 476                 * is not. We'll find the actual device to use once we
  477                 * set up the controller when the port's device is available.
 478                 */
 479
 480                p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns));
 481                if (!p2p_dev) {
 482                        pr_err("no peer-to-peer memory is available for %s\n",
 483                               ns->device_path);
 484                        return -EINVAL;
 485                }
 486
 487                pci_dev_put(p2p_dev);
 488        }
 489
 490        return 0;
 491}
 492
 493/*
 494 * Note: ctrl->subsys->lock should be held when calling this function
 495 */
 496static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
 497                                    struct nvmet_ns *ns)
 498{
 499        struct device *clients[2];
 500        struct pci_dev *p2p_dev;
 501        int ret;
 502
 503        if (!ctrl->p2p_client || !ns->use_p2pmem)
 504                return;
 505
 506        if (ns->p2p_dev) {
 507                ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true);
 508                if (ret < 0)
 509                        return;
 510
 511                p2p_dev = pci_dev_get(ns->p2p_dev);
 512        } else {
 513                clients[0] = ctrl->p2p_client;
 514                clients[1] = nvmet_ns_dev(ns);
 515
 516                p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
 517                if (!p2p_dev) {
 518                        pr_err("no peer-to-peer memory is available that's supported by %s and %s\n",
 519                               dev_name(ctrl->p2p_client), ns->device_path);
 520                        return;
 521                }
 522        }
 523
 524        ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev);
 525        if (ret < 0)
 526                pci_dev_put(p2p_dev);
 527
 528        pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev),
 529                ns->nsid);
 530}
 531
 532void nvmet_ns_revalidate(struct nvmet_ns *ns)
 533{
 534        loff_t oldsize = ns->size;
 535
 536        if (ns->bdev)
 537                nvmet_bdev_ns_revalidate(ns);
 538        else
 539                nvmet_file_ns_revalidate(ns);
 540
 541        if (oldsize != ns->size)
 542                nvmet_ns_changed(ns->subsys, ns->nsid);
 543}
 544
 545int nvmet_ns_enable(struct nvmet_ns *ns)
 546{
 547        struct nvmet_subsys *subsys = ns->subsys;
 548        struct nvmet_ctrl *ctrl;
 549        int ret;
 550
 551        mutex_lock(&subsys->lock);
 552        ret = 0;
 553
 554        if (nvmet_passthru_ctrl(subsys)) {
  555                pr_info("cannot enable both passthru and regular namespaces for a single subsystem\n");
 556                goto out_unlock;
 557        }
 558
 559        if (ns->enabled)
 560                goto out_unlock;
 561
 562        ret = -EMFILE;
 563        if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
 564                goto out_unlock;
 565
 566        ret = nvmet_bdev_ns_enable(ns);
 567        if (ret == -ENOTBLK)
 568                ret = nvmet_file_ns_enable(ns);
 569        if (ret)
 570                goto out_unlock;
 571
 572        ret = nvmet_p2pmem_ns_enable(ns);
 573        if (ret)
 574                goto out_dev_disable;
 575
 576        list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 577                nvmet_p2pmem_ns_add_p2p(ctrl, ns);
 578
 579        ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
 580                                0, GFP_KERNEL);
 581        if (ret)
 582                goto out_dev_put;
 583
 584        if (ns->nsid > subsys->max_nsid)
 585                subsys->max_nsid = ns->nsid;
 586
 587        ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
 588        if (ret)
 589                goto out_restore_subsys_maxnsid;
 590
 591        subsys->nr_namespaces++;
 592
 593        nvmet_ns_changed(subsys, ns->nsid);
 594        ns->enabled = true;
 595        ret = 0;
 596out_unlock:
 597        mutex_unlock(&subsys->lock);
 598        return ret;
 599
 600out_restore_subsys_maxnsid:
 601        subsys->max_nsid = nvmet_max_nsid(subsys);
 602        percpu_ref_exit(&ns->ref);
 603out_dev_put:
 604        list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 605                pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 606out_dev_disable:
 607        nvmet_ns_dev_disable(ns);
 608        goto out_unlock;
 609}
 610
 611void nvmet_ns_disable(struct nvmet_ns *ns)
 612{
 613        struct nvmet_subsys *subsys = ns->subsys;
 614        struct nvmet_ctrl *ctrl;
 615
 616        mutex_lock(&subsys->lock);
 617        if (!ns->enabled)
 618                goto out_unlock;
 619
 620        ns->enabled = false;
 621        xa_erase(&ns->subsys->namespaces, ns->nsid);
 622        if (ns->nsid == subsys->max_nsid)
 623                subsys->max_nsid = nvmet_max_nsid(subsys);
 624
 625        list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
 626                pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
 627
 628        mutex_unlock(&subsys->lock);
 629
 630        /*
 631         * Now that we removed the namespaces from the lookup list, we
 632         * can kill the per_cpu ref and wait for any remaining references
  633         * to be dropped, as well as an RCU grace period for anyone only
  634         * using the namespace under rcu_read_lock().  Note that we can't
 635         * use call_rcu here as we need to ensure the namespaces have
 636         * been fully destroyed before unloading the module.
 637         */
 638        percpu_ref_kill(&ns->ref);
 639        synchronize_rcu();
 640        wait_for_completion(&ns->disable_done);
 641        percpu_ref_exit(&ns->ref);
 642
 643        mutex_lock(&subsys->lock);
 644
 645        subsys->nr_namespaces--;
 646        nvmet_ns_changed(subsys, ns->nsid);
 647        nvmet_ns_dev_disable(ns);
 648out_unlock:
 649        mutex_unlock(&subsys->lock);
 650}
 651
 652void nvmet_ns_free(struct nvmet_ns *ns)
 653{
 654        nvmet_ns_disable(ns);
 655
 656        down_write(&nvmet_ana_sem);
 657        nvmet_ana_group_enabled[ns->anagrpid]--;
 658        up_write(&nvmet_ana_sem);
 659
 660        kfree(ns->device_path);
 661        kfree(ns);
 662}
 663
 664struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 665{
 666        struct nvmet_ns *ns;
 667
 668        ns = kzalloc(sizeof(*ns), GFP_KERNEL);
 669        if (!ns)
 670                return NULL;
 671
 672        init_completion(&ns->disable_done);
 673
 674        ns->nsid = nsid;
 675        ns->subsys = subsys;
 676
 677        down_write(&nvmet_ana_sem);
 678        ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
 679        nvmet_ana_group_enabled[ns->anagrpid]++;
 680        up_write(&nvmet_ana_sem);
 681
 682        uuid_gen(&ns->uuid);
 683        ns->buffered_io = false;
 684
 685        return ns;
 686}
 687
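/*
 * Advance the submission queue head with a cmpxchg() loop so concurrent
 * completions on the same queue never lose an update, then report the new
 * head in the CQE.
 */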
 688static void nvmet_update_sq_head(struct nvmet_req *req)
 689{
 690        if (req->sq->size) {
 691                u32 old_sqhd, new_sqhd;
 692
 693                do {
 694                        old_sqhd = req->sq->sqhd;
 695                        new_sqhd = (old_sqhd + 1) % req->sq->size;
 696                } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
 697                                        old_sqhd);
 698        }
 699        req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
 700}
 701
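/*
 * Record a failed command: set the CQE status and, unless the error has no
 * meaningful location, append an entry to the controller's circular error log
 * (NVMET_ERROR_LOG_SLOTS deep) and flag the MORE bit in the status field.
 */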
 702static void nvmet_set_error(struct nvmet_req *req, u16 status)
 703{
 704        struct nvmet_ctrl *ctrl = req->sq->ctrl;
 705        struct nvme_error_slot *new_error_slot;
 706        unsigned long flags;
 707
 708        req->cqe->status = cpu_to_le16(status << 1);
 709
 710        if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
 711                return;
 712
 713        spin_lock_irqsave(&ctrl->error_lock, flags);
 714        ctrl->err_counter++;
 715        new_error_slot =
 716                &ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
 717
 718        new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
 719        new_error_slot->sqid = cpu_to_le16(req->sq->qid);
 720        new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
 721        new_error_slot->status_field = cpu_to_le16(status << 1);
 722        new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
 723        new_error_slot->lba = cpu_to_le64(req->error_slba);
 724        new_error_slot->nsid = req->cmd->common.nsid;
 725        spin_unlock_irqrestore(&ctrl->error_lock, flags);
 726
 727        /* set the more bit for this request */
 728        req->cqe->status |= cpu_to_le16(1 << 14);
 729}
 730
 731static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 732{
 733        if (!req->sq->sqhd_disabled)
 734                nvmet_update_sq_head(req);
 735        req->cqe->sq_id = cpu_to_le16(req->sq->qid);
 736        req->cqe->command_id = req->cmd->common.command_id;
 737
 738        if (unlikely(status))
 739                nvmet_set_error(req, status);
 740
 741        trace_nvmet_req_complete(req);
 742
 743        if (req->ns)
 744                nvmet_put_namespace(req->ns);
 745        req->ops->queue_response(req);
 746}
 747
 748void nvmet_req_complete(struct nvmet_req *req, u16 status)
 749{
 750        __nvmet_req_complete(req, status);
 751        percpu_ref_put(&req->sq->ref);
 752}
 753EXPORT_SYMBOL_GPL(nvmet_req_complete);
 754
 755void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
 756                u16 qid, u16 size)
 757{
 758        cq->qid = qid;
 759        cq->size = size;
 760}
 761
 762void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
 763                u16 qid, u16 size)
 764{
 765        sq->sqhd = 0;
 766        sq->qid = qid;
 767        sq->size = size;
 768
 769        ctrl->sqs[qid] = sq;
 770}
 771
 772static void nvmet_confirm_sq(struct percpu_ref *ref)
 773{
 774        struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
 775
 776        complete(&sq->confirm_done);
 777}
 778
 779void nvmet_sq_destroy(struct nvmet_sq *sq)
 780{
 781        struct nvmet_ctrl *ctrl = sq->ctrl;
 782
 783        /*
 784         * If this is the admin queue, complete all AERs so that our
 785         * queue doesn't have outstanding requests on it.
 786         */
 787        if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq)
 788                nvmet_async_events_failall(ctrl);
 789        percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
 790        wait_for_completion(&sq->confirm_done);
 791        wait_for_completion(&sq->free_done);
 792        percpu_ref_exit(&sq->ref);
 793
 794        if (ctrl) {
 795                nvmet_ctrl_put(ctrl);
 796                sq->ctrl = NULL; /* allows reusing the queue later */
 797        }
 798}
 799EXPORT_SYMBOL_GPL(nvmet_sq_destroy);
 800
 801static void nvmet_sq_free(struct percpu_ref *ref)
 802{
 803        struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
 804
 805        complete(&sq->free_done);
 806}
 807
 808int nvmet_sq_init(struct nvmet_sq *sq)
 809{
 810        int ret;
 811
 812        ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
 813        if (ret) {
 814                pr_err("percpu_ref init failed!\n");
 815                return ret;
 816        }
 817        init_completion(&sq->free_done);
 818        init_completion(&sq->confirm_done);
 819
 820        return 0;
 821}
 822EXPORT_SYMBOL_GPL(nvmet_sq_init);
 823
 824static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
 825                struct nvmet_ns *ns)
 826{
 827        enum nvme_ana_state state = port->ana_state[ns->anagrpid];
 828
 829        if (unlikely(state == NVME_ANA_INACCESSIBLE))
 830                return NVME_SC_ANA_INACCESSIBLE;
 831        if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
 832                return NVME_SC_ANA_PERSISTENT_LOSS;
 833        if (unlikely(state == NVME_ANA_CHANGE))
 834                return NVME_SC_ANA_TRANSITION;
 835        return 0;
 836}
 837
 838static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
 839{
 840        if (unlikely(req->ns->readonly)) {
 841                switch (req->cmd->common.opcode) {
 842                case nvme_cmd_read:
 843                case nvme_cmd_flush:
 844                        break;
 845                default:
 846                        return NVME_SC_NS_WRITE_PROTECTED;
 847                }
 848        }
 849
 850        return 0;
 851}
 852
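/*
 * I/O command parsing pipeline: verify the controller is enabled and ready,
 * divert to the passthru path if configured, resolve the target namespace,
 * check its ANA state and write-protect status, then dispatch to the file or
 * block-device backend parser.
 */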
 853static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 854{
 855        struct nvme_command *cmd = req->cmd;
 856        u16 ret;
 857
 858        ret = nvmet_check_ctrl_status(req, cmd);
 859        if (unlikely(ret))
 860                return ret;
 861
 862        if (nvmet_req_passthru_ctrl(req))
 863                return nvmet_parse_passthru_io_cmd(req);
 864
 865        req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
 866        if (unlikely(!req->ns)) {
 867                req->error_loc = offsetof(struct nvme_common_command, nsid);
 868                return NVME_SC_INVALID_NS | NVME_SC_DNR;
 869        }
 870        ret = nvmet_check_ana_state(req->port, req->ns);
 871        if (unlikely(ret)) {
 872                req->error_loc = offsetof(struct nvme_common_command, nsid);
 873                return ret;
 874        }
 875        ret = nvmet_io_cmd_check_access(req);
 876        if (unlikely(ret)) {
 877                req->error_loc = offsetof(struct nvme_common_command, nsid);
 878                return ret;
 879        }
 880
 881        if (req->ns->file)
 882                return nvmet_file_parse_io_cmd(req);
 883        else
 884                return nvmet_bdev_parse_io_cmd(req);
 885}
 886
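/*
 * Entry point used by the transports for every received command capsule:
 * initialize the request, reject fused commands and unexpected SGL descriptor
 * settings, route the command to the connect/admin/I/O parsers, and take a
 * reference on the submission queue for the lifetime of the request.
 */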
 887bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 888                struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
 889{
 890        u8 flags = req->cmd->common.flags;
 891        u16 status;
 892
 893        req->cq = cq;
 894        req->sq = sq;
 895        req->ops = ops;
 896        req->sg = NULL;
 897        req->metadata_sg = NULL;
 898        req->sg_cnt = 0;
 899        req->metadata_sg_cnt = 0;
 900        req->transfer_len = 0;
 901        req->metadata_len = 0;
 902        req->cqe->status = 0;
 903        req->cqe->sq_head = 0;
 904        req->ns = NULL;
 905        req->error_loc = NVMET_NO_ERROR_LOC;
 906        req->error_slba = 0;
 907
 908        /* no support for fused commands yet */
 909        if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
 910                req->error_loc = offsetof(struct nvme_common_command, flags);
 911                status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 912                goto fail;
 913        }
 914
 915        /*
 916         * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
 917         * contains an address of a single contiguous physical buffer that is
 918         * byte aligned.
 919         */
 920        if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
 921                req->error_loc = offsetof(struct nvme_common_command, flags);
 922                status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 923                goto fail;
 924        }
 925
 926        if (unlikely(!req->sq->ctrl))
 927                /* will return an error for any non-connect command: */
 928                status = nvmet_parse_connect_cmd(req);
 929        else if (likely(req->sq->qid != 0))
 930                status = nvmet_parse_io_cmd(req);
 931        else
 932                status = nvmet_parse_admin_cmd(req);
 933
 934        if (status)
 935                goto fail;
 936
 937        trace_nvmet_req_init(req, req->cmd);
 938
 939        if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
 940                status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 941                goto fail;
 942        }
 943
 944        if (sq->ctrl)
 945                sq->ctrl->cmd_seen = true;
 946
 947        return true;
 948
 949fail:
 950        __nvmet_req_complete(req, status);
 951        return false;
 952}
 953EXPORT_SYMBOL_GPL(nvmet_req_init);
 954
 955void nvmet_req_uninit(struct nvmet_req *req)
 956{
 957        percpu_ref_put(&req->sq->ref);
 958        if (req->ns)
 959                nvmet_put_namespace(req->ns);
 960}
 961EXPORT_SYMBOL_GPL(nvmet_req_uninit);
 962
 963bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len)
 964{
 965        if (unlikely(len != req->transfer_len)) {
 966                req->error_loc = offsetof(struct nvme_common_command, dptr);
 967                nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
 968                return false;
 969        }
 970
 971        return true;
 972}
 973EXPORT_SYMBOL_GPL(nvmet_check_transfer_len);
 974
 975bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len)
 976{
 977        if (unlikely(data_len > req->transfer_len)) {
 978                req->error_loc = offsetof(struct nvme_common_command, dptr);
 979                nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
 980                return false;
 981        }
 982
 983        return true;
 984}
 985
 986static unsigned int nvmet_data_transfer_len(struct nvmet_req *req)
 987{
 988        return req->transfer_len - req->metadata_len;
 989}
 990
 991static int nvmet_req_alloc_p2pmem_sgls(struct nvmet_req *req)
 992{
 993        req->sg = pci_p2pmem_alloc_sgl(req->p2p_dev, &req->sg_cnt,
 994                        nvmet_data_transfer_len(req));
 995        if (!req->sg)
 996                goto out_err;
 997
 998        if (req->metadata_len) {
 999                req->metadata_sg = pci_p2pmem_alloc_sgl(req->p2p_dev,
1000                                &req->metadata_sg_cnt, req->metadata_len);
1001                if (!req->metadata_sg)
1002                        goto out_free_sg;
1003        }
1004        return 0;
1005out_free_sg:
1006        pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
1007out_err:
1008        return -ENOMEM;
1009}
1010
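/*
 * P2P memory is only used for I/O queue commands that target a namespace for
 * which the controller mapped a P2P device in p2p_ns_map; otherwise the
 * caller falls back to regular sgl_alloc().
 */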
1011static bool nvmet_req_find_p2p_dev(struct nvmet_req *req)
1012{
1013        if (!IS_ENABLED(CONFIG_PCI_P2PDMA))
1014                return false;
1015
1016        if (req->sq->ctrl && req->sq->qid && req->ns) {
1017                req->p2p_dev = radix_tree_lookup(&req->sq->ctrl->p2p_ns_map,
1018                                                 req->ns->nsid);
1019                if (req->p2p_dev)
1020                        return true;
1021        }
1022
1023        req->p2p_dev = NULL;
1024        return false;
1025}
1026
1027int nvmet_req_alloc_sgls(struct nvmet_req *req)
1028{
1029        if (nvmet_req_find_p2p_dev(req) && !nvmet_req_alloc_p2pmem_sgls(req))
1030                return 0;
1031
1032        req->sg = sgl_alloc(nvmet_data_transfer_len(req), GFP_KERNEL,
1033                            &req->sg_cnt);
1034        if (unlikely(!req->sg))
1035                goto out;
1036
1037        if (req->metadata_len) {
1038                req->metadata_sg = sgl_alloc(req->metadata_len, GFP_KERNEL,
1039                                             &req->metadata_sg_cnt);
1040                if (unlikely(!req->metadata_sg))
1041                        goto out_free;
1042        }
1043
1044        return 0;
1045out_free:
1046        sgl_free(req->sg);
1047out:
1048        return -ENOMEM;
1049}
1050EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgls);
1051
1052void nvmet_req_free_sgls(struct nvmet_req *req)
1053{
1054        if (req->p2p_dev) {
1055                pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
1056                if (req->metadata_sg)
1057                        pci_p2pmem_free_sgl(req->p2p_dev, req->metadata_sg);
1058        } else {
1059                sgl_free(req->sg);
1060                if (req->metadata_sg)
1061                        sgl_free(req->metadata_sg);
1062        }
1063
1064        req->sg = NULL;
1065        req->metadata_sg = NULL;
1066        req->sg_cnt = 0;
1067        req->metadata_sg_cnt = 0;
1068}
1069EXPORT_SYMBOL_GPL(nvmet_req_free_sgls);
1070
1071static inline bool nvmet_cc_en(u32 cc)
1072{
1073        return (cc >> NVME_CC_EN_SHIFT) & 0x1;
1074}
1075
1076static inline u8 nvmet_cc_css(u32 cc)
1077{
1078        return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
1079}
1080
1081static inline u8 nvmet_cc_mps(u32 cc)
1082{
1083        return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
1084}
1085
1086static inline u8 nvmet_cc_ams(u32 cc)
1087{
1088        return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
1089}
1090
1091static inline u8 nvmet_cc_shn(u32 cc)
1092{
1093        return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
1094}
1095
1096static inline u8 nvmet_cc_iosqes(u32 cc)
1097{
1098        return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
1099}
1100
1101static inline u8 nvmet_cc_iocqes(u32 cc)
1102{
1103        return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
1104}
1105
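/*
 * Called with ctrl->lock held when CC.EN transitions to 1: validate the I/O
 * queue entry sizes and the MPS/AMS/CSS fields, then either report a fatal
 * status (CSTS.CFS) or mark the controller ready (CSTS.RDY).
 */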
1106static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
1107{
1108        lockdep_assert_held(&ctrl->lock);
1109
1110        if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
1111            nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
1112            nvmet_cc_mps(ctrl->cc) != 0 ||
1113            nvmet_cc_ams(ctrl->cc) != 0 ||
1114            nvmet_cc_css(ctrl->cc) != 0) {
1115                ctrl->csts = NVME_CSTS_CFS;
1116                return;
1117        }
1118
1119        ctrl->csts = NVME_CSTS_RDY;
1120
1121        /*
1122         * Controllers that are not yet enabled should not really enforce the
 1123         * keep alive timeout, but we still want to track a timeout and clean up
1124         * in case a host died before it enabled the controller.  Hence, simply
1125         * reset the keep alive timer when the controller is enabled.
1126         */
1127        if (ctrl->kato)
1128                mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
1129}
1130
1131static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
1132{
1133        lockdep_assert_held(&ctrl->lock);
1134
1135        /* XXX: tear down queues? */
1136        ctrl->csts &= ~NVME_CSTS_RDY;
1137        ctrl->cc = 0;
1138}
1139
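/*
 * Apply a Controller Configuration write: start the controller on an EN 0->1
 * transition, tear it down on 1->0, and handle a shutdown notification by
 * clearing the controller and latching CSTS.SHST_CMPLT.
 */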
1140void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
1141{
1142        u32 old;
1143
1144        mutex_lock(&ctrl->lock);
1145        old = ctrl->cc;
1146        ctrl->cc = new;
1147
1148        if (nvmet_cc_en(new) && !nvmet_cc_en(old))
1149                nvmet_start_ctrl(ctrl);
1150        if (!nvmet_cc_en(new) && nvmet_cc_en(old))
1151                nvmet_clear_ctrl(ctrl);
1152        if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
1153                nvmet_clear_ctrl(ctrl);
1154                ctrl->csts |= NVME_CSTS_SHST_CMPLT;
1155        }
1156        if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
1157                ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
1158        mutex_unlock(&ctrl->lock);
1159}
1160
1161static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
1162{
1163        /* command sets supported: NVMe command set: */
1164        ctrl->cap = (1ULL << 37);
1165        /* CC.EN timeout in 500msec units: */
1166        ctrl->cap |= (15ULL << 24);
1167        /* maximum queue entries supported: */
1168        ctrl->cap |= NVMET_QUEUE_SIZE - 1;
1169}
1170
1171u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
1172                struct nvmet_req *req, struct nvmet_ctrl **ret)
1173{
1174        struct nvmet_subsys *subsys;
1175        struct nvmet_ctrl *ctrl;
1176        u16 status = 0;
1177
1178        subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1179        if (!subsys) {
1180                pr_warn("connect request for invalid subsystem %s!\n",
1181                        subsysnqn);
1182                req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1183                return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1184        }
1185
1186        mutex_lock(&subsys->lock);
1187        list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
1188                if (ctrl->cntlid == cntlid) {
1189                        if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
1190                                pr_warn("hostnqn mismatch.\n");
1191                                continue;
1192                        }
1193                        if (!kref_get_unless_zero(&ctrl->ref))
1194                                continue;
1195
1196                        *ret = ctrl;
1197                        goto out;
1198                }
1199        }
1200
1201        pr_warn("could not find controller %d for subsys %s / host %s\n",
1202                cntlid, subsysnqn, hostnqn);
1203        req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
1204        status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1205
1206out:
1207        mutex_unlock(&subsys->lock);
1208        nvmet_subsys_put(subsys);
1209        return status;
1210}
1211
1212u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
1213{
1214        if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
1215                pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
1216                       cmd->common.opcode, req->sq->qid);
1217                return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1218        }
1219
1220        if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
1221                pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
1222                       cmd->common.opcode, req->sq->qid);
1223                return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
1224        }
1225        return 0;
1226}
1227
1228bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
1229{
1230        struct nvmet_host_link *p;
1231
1232        lockdep_assert_held(&nvmet_config_sem);
1233
1234        if (subsys->allow_any_host)
1235                return true;
1236
1237        if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
1238                return true;
1239
1240        list_for_each_entry(p, &subsys->hosts, entry) {
1241                if (!strcmp(nvmet_host_name(p->host), hostnqn))
1242                        return true;
1243        }
1244
1245        return false;
1246}
1247
1248/*
1249 * Note: ctrl->subsys->lock should be held when calling this function
1250 */
1251static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
1252                struct nvmet_req *req)
1253{
1254        struct nvmet_ns *ns;
1255        unsigned long idx;
1256
1257        if (!req->p2p_client)
1258                return;
1259
1260        ctrl->p2p_client = get_device(req->p2p_client);
1261
1262        xa_for_each(&ctrl->subsys->namespaces, idx, ns)
1263                nvmet_p2pmem_ns_add_p2p(ctrl, ns);
1264}
1265
1266/*
1267 * Note: ctrl->subsys->lock should be held when calling this function
1268 */
1269static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
1270{
1271        struct radix_tree_iter iter;
1272        void __rcu **slot;
1273
1274        radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
1275                pci_dev_put(radix_tree_deref_slot(slot));
1276
1277        put_device(ctrl->p2p_client);
1278}
1279
1280static void nvmet_fatal_error_handler(struct work_struct *work)
1281{
1282        struct nvmet_ctrl *ctrl =
1283                        container_of(work, struct nvmet_ctrl, fatal_err_work);
1284
1285        pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
1286        ctrl->ops->delete_ctrl(ctrl);
1287}
1288
1289u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1290                struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
1291{
1292        struct nvmet_subsys *subsys;
1293        struct nvmet_ctrl *ctrl;
1294        int ret;
1295        u16 status;
1296
1297        status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1298        subsys = nvmet_find_get_subsys(req->port, subsysnqn);
1299        if (!subsys) {
1300                pr_warn("connect request for invalid subsystem %s!\n",
1301                        subsysnqn);
1302                req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1303                goto out;
1304        }
1305
1306        status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1307        down_read(&nvmet_config_sem);
1308        if (!nvmet_host_allowed(subsys, hostnqn)) {
1309                pr_info("connect by host %s for subsystem %s not allowed\n",
1310                        hostnqn, subsysnqn);
1311                req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
1312                up_read(&nvmet_config_sem);
1313                status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR;
1314                goto out_put_subsystem;
1315        }
1316        up_read(&nvmet_config_sem);
1317
1318        status = NVME_SC_INTERNAL;
1319        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1320        if (!ctrl)
1321                goto out_put_subsystem;
1322        mutex_init(&ctrl->lock);
1323
1324        nvmet_init_cap(ctrl);
1325
1326        ctrl->port = req->port;
1327
1328        INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
1329        INIT_LIST_HEAD(&ctrl->async_events);
1330        INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
1331        INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
1332
1333        memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
1334        memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
1335
1336        kref_init(&ctrl->ref);
1337        ctrl->subsys = subsys;
1338        WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
1339
1340        ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
1341                        sizeof(__le32), GFP_KERNEL);
1342        if (!ctrl->changed_ns_list)
1343                goto out_free_ctrl;
1344
1345        ctrl->sqs = kcalloc(subsys->max_qid + 1,
1346                        sizeof(struct nvmet_sq *),
1347                        GFP_KERNEL);
1348        if (!ctrl->sqs)
1349                goto out_free_changed_ns_list;
1350
1351        if (subsys->cntlid_min > subsys->cntlid_max)
1352                goto out_free_changed_ns_list;
1353
1354        ret = ida_simple_get(&cntlid_ida,
1355                             subsys->cntlid_min, subsys->cntlid_max,
1356                             GFP_KERNEL);
1357        if (ret < 0) {
1358                status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
1359                goto out_free_sqs;
1360        }
1361        ctrl->cntlid = ret;
1362
1363        ctrl->ops = req->ops;
1364
1365        /*
1366         * Discovery controllers may use some arbitrary high value
 1367         * in order to clean up stale discovery sessions
1368         */
1369        if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
1370                kato = NVMET_DISC_KATO_MS;
1371
1372        /* keep-alive timeout in seconds */
1373        ctrl->kato = DIV_ROUND_UP(kato, 1000);
1374
1375        ctrl->err_counter = 0;
1376        spin_lock_init(&ctrl->error_lock);
1377
1378        nvmet_start_keep_alive_timer(ctrl);
1379
1380        mutex_lock(&subsys->lock);
1381        list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
1382        nvmet_setup_p2p_ns_map(ctrl, req);
1383        mutex_unlock(&subsys->lock);
1384
1385        *ctrlp = ctrl;
1386        return 0;
1387
1388out_free_sqs:
1389        kfree(ctrl->sqs);
1390out_free_changed_ns_list:
1391        kfree(ctrl->changed_ns_list);
1392out_free_ctrl:
1393        kfree(ctrl);
1394out_put_subsystem:
1395        nvmet_subsys_put(subsys);
1396out:
1397        return status;
1398}
1399
1400static void nvmet_ctrl_free(struct kref *ref)
1401{
1402        struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
1403        struct nvmet_subsys *subsys = ctrl->subsys;
1404
1405        mutex_lock(&subsys->lock);
1406        nvmet_release_p2p_ns_map(ctrl);
1407        list_del(&ctrl->subsys_entry);
1408        mutex_unlock(&subsys->lock);
1409
1410        nvmet_stop_keep_alive_timer(ctrl);
1411
1412        flush_work(&ctrl->async_event_work);
1413        cancel_work_sync(&ctrl->fatal_err_work);
1414
1415        ida_simple_remove(&cntlid_ida, ctrl->cntlid);
1416
1417        nvmet_async_events_free(ctrl);
1418        kfree(ctrl->sqs);
1419        kfree(ctrl->changed_ns_list);
1420        kfree(ctrl);
1421
1422        nvmet_subsys_put(subsys);
1423}
1424
1425void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
1426{
1427        kref_put(&ctrl->ref, nvmet_ctrl_free);
1428}
1429
1430void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
1431{
1432        mutex_lock(&ctrl->lock);
1433        if (!(ctrl->csts & NVME_CSTS_CFS)) {
1434                ctrl->csts |= NVME_CSTS_CFS;
1435                schedule_work(&ctrl->fatal_err_work);
1436        }
1437        mutex_unlock(&ctrl->lock);
1438}
1439EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
1440
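/*
 * Resolve a subsystem NQN in the context of the port a connection arrived on:
 * the well-known discovery NQN maps to the global discovery subsystem, any
 * other NQN is looked up among the subsystems linked to the port under
 * nvmet_config_sem. A reference is taken on the returned subsystem.
 */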
1441static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
1442                const char *subsysnqn)
1443{
1444        struct nvmet_subsys_link *p;
1445
1446        if (!port)
1447                return NULL;
1448
1449        if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
1450                if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
1451                        return NULL;
1452                return nvmet_disc_subsys;
1453        }
1454
1455        down_read(&nvmet_config_sem);
1456        list_for_each_entry(p, &port->subsystems, entry) {
1457                if (!strncmp(p->subsys->subsysnqn, subsysnqn,
1458                                NVMF_NQN_SIZE)) {
1459                        if (!kref_get_unless_zero(&p->subsys->ref))
1460                                break;
1461                        up_read(&nvmet_config_sem);
1462                        return p->subsys;
1463                }
1464        }
1465        up_read(&nvmet_config_sem);
1466        return NULL;
1467}
1468
1469struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
1470                enum nvme_subsys_type type)
1471{
1472        struct nvmet_subsys *subsys;
1473
1474        subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
1475        if (!subsys)
1476                return ERR_PTR(-ENOMEM);
1477
1478        subsys->ver = NVMET_DEFAULT_VS;
1479        /* generate a random serial number as our controllers are ephemeral: */
1480        get_random_bytes(&subsys->serial, sizeof(subsys->serial));
1481
1482        switch (type) {
1483        case NVME_NQN_NVME:
1484                subsys->max_qid = NVMET_NR_QUEUES;
1485                break;
1486        case NVME_NQN_DISC:
1487                subsys->max_qid = 0;
1488                break;
1489        default:
1490                pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
1491                kfree(subsys);
1492                return ERR_PTR(-EINVAL);
1493        }
1494        subsys->type = type;
1495        subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
1496                        GFP_KERNEL);
1497        if (!subsys->subsysnqn) {
1498                kfree(subsys);
1499                return ERR_PTR(-ENOMEM);
1500        }
1501        subsys->cntlid_min = NVME_CNTLID_MIN;
1502        subsys->cntlid_max = NVME_CNTLID_MAX;
1503        kref_init(&subsys->ref);
1504
1505        mutex_init(&subsys->lock);
1506        xa_init(&subsys->namespaces);
1507        INIT_LIST_HEAD(&subsys->ctrls);
1508        INIT_LIST_HEAD(&subsys->hosts);
1509
1510        return subsys;
1511}
1512
1513static void nvmet_subsys_free(struct kref *ref)
1514{
1515        struct nvmet_subsys *subsys =
1516                container_of(ref, struct nvmet_subsys, ref);
1517
1518        WARN_ON_ONCE(!xa_empty(&subsys->namespaces));
1519
1520        xa_destroy(&subsys->namespaces);
1521        nvmet_passthru_subsys_free(subsys);
1522
1523        kfree(subsys->subsysnqn);
1524        kfree_rcu(subsys->model, rcuhead);
1525        kfree(subsys);
1526}
1527
1528void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys)
1529{
1530        struct nvmet_ctrl *ctrl;
1531
1532        mutex_lock(&subsys->lock);
1533        list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
1534                ctrl->ops->delete_ctrl(ctrl);
1535        mutex_unlock(&subsys->lock);
1536}
1537
1538void nvmet_subsys_put(struct nvmet_subsys *subsys)
1539{
1540        kref_put(&subsys->ref, nvmet_subsys_free);
1541}
1542
1543static int __init nvmet_init(void)
1544{
1545        int error;
1546
1547        nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
1548
1549        buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
1550                        WQ_MEM_RECLAIM, 0);
1551        if (!buffered_io_wq) {
1552                error = -ENOMEM;
1553                goto out;
1554        }
1555
1556        error = nvmet_init_discovery();
1557        if (error)
1558                goto out_free_work_queue;
1559
1560        error = nvmet_init_configfs();
1561        if (error)
1562                goto out_exit_discovery;
1563        return 0;
1564
1565out_exit_discovery:
1566        nvmet_exit_discovery();
1567out_free_work_queue:
1568        destroy_workqueue(buffered_io_wq);
1569out:
1570        return error;
1571}
1572
1573static void __exit nvmet_exit(void)
1574{
1575        nvmet_exit_configfs();
1576        nvmet_exit_discovery();
1577        ida_destroy(&cntlid_ida);
1578        destroy_workqueue(buffered_io_wq);
1579
1580        BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
1581        BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
1582}
1583
1584module_init(nvmet_init);
1585module_exit(nvmet_exit);
1586
1587MODULE_LICENSE("GPL v2");
1588