linux/drivers/nvme/host/core.c
   1/*
   2 * NVM Express device driver
   3 * Copyright (c) 2011-2014, Intel Corporation.
   4 *
   5 * This program is free software; you can redistribute it and/or modify it
   6 * under the terms and conditions of the GNU General Public License,
   7 * version 2, as published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope it will be useful, but WITHOUT
  10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  12 * more details.
  13 */
  14
  15#include <linux/blkdev.h>
  16#include <linux/blk-mq.h>
  17#include <linux/delay.h>
  18#include <linux/errno.h>
  19#include <linux/hdreg.h>
  20#include <linux/kernel.h>
  21#include <linux/module.h>
  22#include <linux/list_sort.h>
  23#include <linux/slab.h>
  24#include <linux/types.h>
  25#include <linux/pr.h>
  26#include <linux/ptrace.h>
  27#include <linux/nvme_ioctl.h>
  28#include <linux/t10-pi.h>
  29#include <linux/pm_qos.h>
  30#include <asm/unaligned.h>
  31
  32#define CREATE_TRACE_POINTS
  33#include "trace.h"
  34
  35#include "nvme.h"
  36#include "fabrics.h"
  37
  38#define NVME_MINORS             (1U << MINORBITS)
  39
  40unsigned int admin_timeout = 60;
  41module_param(admin_timeout, uint, 0644);
  42MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
  43EXPORT_SYMBOL_GPL(admin_timeout);
  44
  45unsigned int nvme_io_timeout = 30;
  46module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
  47MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
  48EXPORT_SYMBOL_GPL(nvme_io_timeout);
  49
  50static unsigned char shutdown_timeout = 5;
  51module_param(shutdown_timeout, byte, 0644);
  52MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
  53
  54static u8 nvme_max_retries = 5;
  55module_param_named(max_retries, nvme_max_retries, byte, 0644);
  56MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
  57
  58static unsigned long default_ps_max_latency_us = 100000;
  59module_param(default_ps_max_latency_us, ulong, 0644);
  60MODULE_PARM_DESC(default_ps_max_latency_us,
  61                 "max power saving latency for new devices; use PM QOS to change per device");
  62
  63static bool force_apst;
  64module_param(force_apst, bool, 0644);
  65MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
  66
  67static bool streams;
  68module_param(streams, bool, 0644);
  69MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
  70
  71/*
  72 * nvme_wq - hosts nvme related works that are not reset or delete
  73 * nvme_reset_wq - hosts nvme reset works
  74 * nvme_delete_wq - hosts nvme delete works
  75 *
   76 * nvme_wq will host works such as scan, aen handling, fw activation,
  77 * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
  78 * runs reset works which also flush works hosted on nvme_wq for
   79 * serialization purposes. nvme_delete_wq hosts controller deletion
  80 * works which flush reset works for serialization.
  81 */
  82struct workqueue_struct *nvme_wq;
  83EXPORT_SYMBOL_GPL(nvme_wq);
  84
  85struct workqueue_struct *nvme_reset_wq;
  86EXPORT_SYMBOL_GPL(nvme_reset_wq);
  87
  88struct workqueue_struct *nvme_delete_wq;
  89EXPORT_SYMBOL_GPL(nvme_delete_wq);
  90
  91static DEFINE_IDA(nvme_subsystems_ida);
  92static LIST_HEAD(nvme_subsystems);
  93static DEFINE_MUTEX(nvme_subsystems_lock);
  94
  95static DEFINE_IDA(nvme_instance_ida);
  96static dev_t nvme_chr_devt;
  97static struct class *nvme_class;
  98static struct class *nvme_subsys_class;
  99
 100static void nvme_ns_remove(struct nvme_ns *ns);
 101static int nvme_revalidate_disk(struct gendisk *disk);
 102
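/*
 * Build dword 10 of a Get Log Page command: the zero-based number of dwords
 * to transfer in bits 31:16 and the log page identifier in bits 7:0.
 */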
 103static __le32 nvme_get_log_dw10(u8 lid, size_t size)
 104{
 105        return cpu_to_le32((((size / 4) - 1) << 16) | lid);
 106}
 107
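/*
 * Move the controller to RESETTING and queue its reset work on nvme_reset_wq.
 * Returns -EBUSY if the state change is not allowed or a reset is already
 * queued.
 */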
 108int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 109{
 110        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 111                return -EBUSY;
 112        if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
 113                return -EBUSY;
 114        return 0;
 115}
 116EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
 117
 118int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 119{
 120        int ret;
 121
 122        ret = nvme_reset_ctrl(ctrl);
 123        if (!ret) {
 124                flush_work(&ctrl->reset_work);
 125                if (ctrl->state != NVME_CTRL_LIVE)
 126                        ret = -ENETRESET;
 127        }
 128
 129        return ret;
 130}
 131EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
 132
 133static void nvme_delete_ctrl_work(struct work_struct *work)
 134{
 135        struct nvme_ctrl *ctrl =
 136                container_of(work, struct nvme_ctrl, delete_work);
 137
 138        flush_work(&ctrl->reset_work);
 139        nvme_stop_ctrl(ctrl);
 140        nvme_remove_namespaces(ctrl);
 141        ctrl->ops->delete_ctrl(ctrl);
 142        nvme_uninit_ctrl(ctrl);
 143        nvme_put_ctrl(ctrl);
 144}
 145
 146int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
 147{
 148        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
 149                return -EBUSY;
 150        if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
 151                return -EBUSY;
 152        return 0;
 153}
 154EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
 155
 156int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
 157{
 158        int ret = 0;
 159
 160        /*
 161         * Keep a reference until the work is flushed since ->delete_ctrl
 162         * can free the controller.
 163         */
 164        nvme_get_ctrl(ctrl);
 165        ret = nvme_delete_ctrl(ctrl);
 166        if (!ret)
 167                flush_work(&ctrl->delete_work);
 168        nvme_put_ctrl(ctrl);
 169        return ret;
 170}
 171EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
 172
 173static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
 174{
 175        return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
 176}
 177
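/*
 * Translate the NVMe status code of a completed request into a block layer
 * blk_status_t; anything not explicitly mapped becomes BLK_STS_IOERR.
 */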
 178static blk_status_t nvme_error_status(struct request *req)
 179{
 180        switch (nvme_req(req)->status & 0x7ff) {
 181        case NVME_SC_SUCCESS:
 182                return BLK_STS_OK;
 183        case NVME_SC_CAP_EXCEEDED:
 184                return BLK_STS_NOSPC;
 185        case NVME_SC_LBA_RANGE:
 186                return BLK_STS_TARGET;
 187        case NVME_SC_BAD_ATTRIBUTES:
 188        case NVME_SC_ONCS_NOT_SUPPORTED:
 189        case NVME_SC_INVALID_OPCODE:
 190        case NVME_SC_INVALID_FIELD:
 191        case NVME_SC_INVALID_NS:
 192                return BLK_STS_NOTSUPP;
 193        case NVME_SC_WRITE_FAULT:
 194        case NVME_SC_READ_ERROR:
 195        case NVME_SC_UNWRITTEN_BLOCK:
 196        case NVME_SC_ACCESS_DENIED:
 197        case NVME_SC_READ_ONLY:
 198        case NVME_SC_COMPARE_FAILED:
 199                return BLK_STS_MEDIUM;
 200        case NVME_SC_GUARD_CHECK:
 201        case NVME_SC_APPTAG_CHECK:
 202        case NVME_SC_REFTAG_CHECK:
 203        case NVME_SC_INVALID_PI:
 204                return BLK_STS_PROTECTION;
 205        case NVME_SC_RESERVATION_CONFLICT:
 206                return BLK_STS_NEXUS;
 207        default:
 208                return BLK_STS_IOERR;
 209        }
 210}
 211
 212static inline bool nvme_req_needs_retry(struct request *req)
 213{
 214        if (blk_noretry_request(req))
 215                return false;
 216        if (nvme_req(req)->status & NVME_SC_DNR)
 217                return false;
 218        if (nvme_req(req)->retries >= nvme_max_retries)
 219                return false;
 220        return true;
 221}
 222
 223void nvme_complete_rq(struct request *req)
 224{
 225        blk_status_t status = nvme_error_status(req);
 226
 227        trace_nvme_complete_rq(req);
 228
 229        if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
 230                if (nvme_req_needs_failover(req, status)) {
 231                        nvme_failover_req(req);
 232                        return;
 233                }
 234
 235                if (!blk_queue_dying(req->q)) {
 236                        nvme_req(req)->retries++;
 237                        blk_mq_requeue_request(req, true);
 238                        return;
 239                }
 240        }
 241        blk_mq_end_request(req, status);
 242}
 243EXPORT_SYMBOL_GPL(nvme_complete_rq);
 244
 245void nvme_cancel_request(struct request *req, void *data, bool reserved)
 246{
 247        if (!blk_mq_request_started(req))
 248                return;
 249
 250        dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
 251                                "Cancelling I/O %d", req->tag);
 252
 253        nvme_req(req)->status = NVME_SC_ABORT_REQ;
 254        blk_mq_complete_request(req);
 255
 256}
 257EXPORT_SYMBOL_GPL(nvme_cancel_request);
 258
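/*
 * Controller state machine: validate the requested transition against the
 * current state under ctrl->lock and only then commit it.  Returns true if
 * the state was changed.  Requeue lists are kicked once a controller goes
 * LIVE so queued multipath I/O can make progress again.
 */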
 259bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 260                enum nvme_ctrl_state new_state)
 261{
 262        enum nvme_ctrl_state old_state;
 263        unsigned long flags;
 264        bool changed = false;
 265
 266        spin_lock_irqsave(&ctrl->lock, flags);
 267
 268        old_state = ctrl->state;
 269        switch (new_state) {
 270        case NVME_CTRL_ADMIN_ONLY:
 271                switch (old_state) {
 272                case NVME_CTRL_CONNECTING:
 273                        changed = true;
 274                        /* FALLTHRU */
 275                default:
 276                        break;
 277                }
 278                break;
 279        case NVME_CTRL_LIVE:
 280                switch (old_state) {
 281                case NVME_CTRL_NEW:
 282                case NVME_CTRL_RESETTING:
 283                case NVME_CTRL_CONNECTING:
 284                        changed = true;
 285                        /* FALLTHRU */
 286                default:
 287                        break;
 288                }
 289                break;
 290        case NVME_CTRL_RESETTING:
 291                switch (old_state) {
 292                case NVME_CTRL_NEW:
 293                case NVME_CTRL_LIVE:
 294                case NVME_CTRL_ADMIN_ONLY:
 295                        changed = true;
 296                        /* FALLTHRU */
 297                default:
 298                        break;
 299                }
 300                break;
 301        case NVME_CTRL_CONNECTING:
 302                switch (old_state) {
 303                case NVME_CTRL_NEW:
 304                case NVME_CTRL_RESETTING:
 305                        changed = true;
 306                        /* FALLTHRU */
 307                default:
 308                        break;
 309                }
 310                break;
 311        case NVME_CTRL_DELETING:
 312                switch (old_state) {
 313                case NVME_CTRL_LIVE:
 314                case NVME_CTRL_ADMIN_ONLY:
 315                case NVME_CTRL_RESETTING:
 316                case NVME_CTRL_CONNECTING:
 317                        changed = true;
 318                        /* FALLTHRU */
 319                default:
 320                        break;
 321                }
 322                break;
 323        case NVME_CTRL_DEAD:
 324                switch (old_state) {
 325                case NVME_CTRL_DELETING:
 326                        changed = true;
 327                        /* FALLTHRU */
 328                default:
 329                        break;
 330                }
 331                break;
 332        default:
 333                break;
 334        }
 335
 336        if (changed)
 337                ctrl->state = new_state;
 338
 339        spin_unlock_irqrestore(&ctrl->lock, flags);
 340        if (changed && ctrl->state == NVME_CTRL_LIVE)
 341                nvme_kick_requeue_lists(ctrl);
 342        return changed;
 343}
 344EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 345
 346static void nvme_free_ns_head(struct kref *ref)
 347{
 348        struct nvme_ns_head *head =
 349                container_of(ref, struct nvme_ns_head, ref);
 350
 351        nvme_mpath_remove_disk(head);
 352        ida_simple_remove(&head->subsys->ns_ida, head->instance);
 353        list_del_init(&head->entry);
 354        cleanup_srcu_struct(&head->srcu);
 355        kfree(head);
 356}
 357
 358static void nvme_put_ns_head(struct nvme_ns_head *head)
 359{
 360        kref_put(&head->ref, nvme_free_ns_head);
 361}
 362
 363static void nvme_free_ns(struct kref *kref)
 364{
 365        struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 366
 367        if (ns->ndev)
 368                nvme_nvm_unregister(ns);
 369
 370        put_disk(ns->disk);
 371        nvme_put_ns_head(ns->head);
 372        nvme_put_ctrl(ns->ctrl);
 373        kfree(ns);
 374}
 375
 376static void nvme_put_ns(struct nvme_ns *ns)
 377{
 378        kref_put(&ns->kref, nvme_free_ns);
 379}
 380
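/*
 * Allocate a passthrough request for the given NVMe command.  NVME_QID_ANY
 * lets blk-mq pick any queue; otherwise the request is bound to the hardware
 * context matching the given queue id.
 */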
 381struct request *nvme_alloc_request(struct request_queue *q,
 382                struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
 383{
 384        unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
 385        struct request *req;
 386
 387        if (qid == NVME_QID_ANY) {
 388                req = blk_mq_alloc_request(q, op, flags);
 389        } else {
 390                req = blk_mq_alloc_request_hctx(q, op, flags,
 391                                qid ? qid - 1 : 0);
 392        }
 393        if (IS_ERR(req))
 394                return req;
 395
 396        req->cmd_flags |= REQ_FAILFAST_DRIVER;
 397        nvme_req(req)->cmd = cmd;
 398
 399        return req;
 400}
 401EXPORT_SYMBOL_GPL(nvme_alloc_request);
 402
 403static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
 404{
 405        struct nvme_command c;
 406
 407        memset(&c, 0, sizeof(c));
 408
 409        c.directive.opcode = nvme_admin_directive_send;
 410        c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
 411        c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
 412        c.directive.dtype = NVME_DIR_IDENTIFY;
 413        c.directive.tdtype = NVME_DIR_STREAMS;
 414        c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
 415
 416        return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
 417}
 418
 419static int nvme_disable_streams(struct nvme_ctrl *ctrl)
 420{
 421        return nvme_toggle_streams(ctrl, false);
 422}
 423
 424static int nvme_enable_streams(struct nvme_ctrl *ctrl)
 425{
 426        return nvme_toggle_streams(ctrl, true);
 427}
 428
 429static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
 430                                  struct streams_directive_params *s, u32 nsid)
 431{
 432        struct nvme_command c;
 433
 434        memset(&c, 0, sizeof(c));
 435        memset(s, 0, sizeof(*s));
 436
 437        c.directive.opcode = nvme_admin_directive_recv;
 438        c.directive.nsid = cpu_to_le32(nsid);
 439        c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
 440        c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
 441        c.directive.dtype = NVME_DIR_STREAMS;
 442
 443        return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
 444}
 445
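/*
 * Enable Streams write directives when both the controller (OACS) and the
 * 'streams' module parameter allow it, and cap the number of streams used to
 * what the write hint infrastructure can address (BLK_MAX_WRITE_HINTS - 1).
 */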
 446static int nvme_configure_directives(struct nvme_ctrl *ctrl)
 447{
 448        struct streams_directive_params s;
 449        int ret;
 450
 451        if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
 452                return 0;
 453        if (!streams)
 454                return 0;
 455
 456        ret = nvme_enable_streams(ctrl);
 457        if (ret)
 458                return ret;
 459
 460        ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
 461        if (ret)
 462                return ret;
 463
 464        ctrl->nssa = le16_to_cpu(s.nssa);
 465        if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
 466                dev_info(ctrl->device, "too few streams (%u) available\n",
 467                                        ctrl->nssa);
 468                nvme_disable_streams(ctrl);
 469                return 0;
 470        }
 471
 472        ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
 473        dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
 474        return 0;
 475}
 476
 477/*
 478 * Check if 'req' has a write hint associated with it. If it does, assign
 479 * a valid namespace stream to the write.
 480 */
 481static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
 482                                     struct request *req, u16 *control,
 483                                     u32 *dsmgmt)
 484{
 485        enum rw_hint streamid = req->write_hint;
 486
 487        if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
 488                streamid = 0;
 489        else {
 490                streamid--;
 491                if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
 492                        return;
 493
 494                *control |= NVME_RW_DTYPE_STREAMS;
 495                *dsmgmt |= streamid << 16;
 496        }
 497
 498        if (streamid < ARRAY_SIZE(req->q->write_hints))
 499                req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
 500}
 501
 502static inline void nvme_setup_flush(struct nvme_ns *ns,
 503                struct nvme_command *cmnd)
 504{
 505        memset(cmnd, 0, sizeof(*cmnd));
 506        cmnd->common.opcode = nvme_cmd_flush;
 507        cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 508}
 509
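/*
 * Turn a discard request into a DSM deallocate command: one nvme_dsm_range
 * descriptor per bio, handed to the transport as a special payload.
 */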
 510static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 511                struct nvme_command *cmnd)
 512{
 513        unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
 514        struct nvme_dsm_range *range;
 515        struct bio *bio;
 516
 517        range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
 518        if (!range)
 519                return BLK_STS_RESOURCE;
 520
 521        __rq_for_each_bio(bio, req) {
 522                u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
 523                u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
 524
 525                if (n < segments) {
 526                        range[n].cattr = cpu_to_le32(0);
 527                        range[n].nlb = cpu_to_le32(nlb);
 528                        range[n].slba = cpu_to_le64(slba);
 529                }
 530                n++;
 531        }
 532
 533        if (WARN_ON_ONCE(n != segments)) {
 534                kfree(range);
 535                return BLK_STS_IOERR;
 536        }
 537
 538        memset(cmnd, 0, sizeof(*cmnd));
 539        cmnd->dsm.opcode = nvme_cmd_dsm;
 540        cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
 541        cmnd->dsm.nr = cpu_to_le32(segments - 1);
 542        cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 543
 544        req->special_vec.bv_page = virt_to_page(range);
 545        req->special_vec.bv_offset = offset_in_page(range);
 546        req->special_vec.bv_len = sizeof(*range) * segments;
 547        req->rq_flags |= RQF_SPECIAL_PAYLOAD;
 548
 549        return BLK_STS_OK;
 550}
 551
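/*
 * Translate a block layer read/write request into an NVMe read/write command,
 * including FUA/limited-retry control bits, stream directives and, for
 * namespaces formatted with protection information, the PRACT/PRCHK settings
 * and reference tag.
 */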
 552static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 553                struct request *req, struct nvme_command *cmnd)
 554{
 555        struct nvme_ctrl *ctrl = ns->ctrl;
 556        u16 control = 0;
 557        u32 dsmgmt = 0;
 558
 559        if (req->cmd_flags & REQ_FUA)
 560                control |= NVME_RW_FUA;
 561        if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
 562                control |= NVME_RW_LR;
 563
 564        if (req->cmd_flags & REQ_RAHEAD)
 565                dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 566
 567        memset(cmnd, 0, sizeof(*cmnd));
 568        cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
 569        cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
 570        cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
 571        cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
 572
 573        if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
 574                nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
 575
 576        if (ns->ms) {
 577                /*
  578                 * If formatted with metadata, the block layer always provides a
 579                 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
 580                 * we enable the PRACT bit for protection information or set the
 581                 * namespace capacity to zero to prevent any I/O.
 582                 */
 583                if (!blk_integrity_rq(req)) {
 584                        if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
 585                                return BLK_STS_NOTSUPP;
 586                        control |= NVME_RW_PRINFO_PRACT;
 587                }
 588
 589                switch (ns->pi_type) {
 590                case NVME_NS_DPS_PI_TYPE3:
 591                        control |= NVME_RW_PRINFO_PRCHK_GUARD;
 592                        break;
 593                case NVME_NS_DPS_PI_TYPE1:
 594                case NVME_NS_DPS_PI_TYPE2:
 595                        control |= NVME_RW_PRINFO_PRCHK_GUARD |
 596                                        NVME_RW_PRINFO_PRCHK_REF;
 597                        cmnd->rw.reftag = cpu_to_le32(
 598                                        nvme_block_nr(ns, blk_rq_pos(req)));
 599                        break;
 600                }
 601        }
 602
 603        cmnd->rw.control = cpu_to_le16(control);
 604        cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
 605        return 0;
 606}
 607
 608blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 609                struct nvme_command *cmd)
 610{
 611        blk_status_t ret = BLK_STS_OK;
 612
 613        if (!(req->rq_flags & RQF_DONTPREP)) {
 614                nvme_req(req)->retries = 0;
 615                nvme_req(req)->flags = 0;
 616                req->rq_flags |= RQF_DONTPREP;
 617        }
 618
 619        switch (req_op(req)) {
 620        case REQ_OP_DRV_IN:
 621        case REQ_OP_DRV_OUT:
 622                memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
 623                break;
 624        case REQ_OP_FLUSH:
 625                nvme_setup_flush(ns, cmd);
 626                break;
 627        case REQ_OP_WRITE_ZEROES:
 628                /* currently only aliased to deallocate for a few ctrls: */
 629        case REQ_OP_DISCARD:
 630                ret = nvme_setup_discard(ns, req, cmd);
 631                break;
 632        case REQ_OP_READ:
 633        case REQ_OP_WRITE:
 634                ret = nvme_setup_rw(ns, req, cmd);
 635                break;
 636        default:
 637                WARN_ON_ONCE(1);
 638                return BLK_STS_IOERR;
 639        }
 640
 641        cmd->common.command_id = req->tag;
 642        if (ns)
 643                trace_nvme_setup_nvm_cmd(req->q->id, cmd);
 644        else
 645                trace_nvme_setup_admin_cmd(cmd);
 646        return ret;
 647}
 648EXPORT_SYMBOL_GPL(nvme_setup_cmd);
 649
 650/*
 651 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 652 * if the result is positive, it's an NVM Express status code
 653 */
 654int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 655                union nvme_result *result, void *buffer, unsigned bufflen,
 656                unsigned timeout, int qid, int at_head,
 657                blk_mq_req_flags_t flags)
 658{
 659        struct request *req;
 660        int ret;
 661
 662        req = nvme_alloc_request(q, cmd, flags, qid);
 663        if (IS_ERR(req))
 664                return PTR_ERR(req);
 665
 666        req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
 667
 668        if (buffer && bufflen) {
 669                ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
 670                if (ret)
 671                        goto out;
 672        }
 673
 674        blk_execute_rq(req->q, NULL, req, at_head);
 675        if (result)
 676                *result = nvme_req(req)->result;
 677        if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
 678                ret = -EINTR;
 679        else
 680                ret = nvme_req(req)->status;
 681 out:
 682        blk_mq_free_request(req);
 683        return ret;
 684}
 685EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
 686
 687int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 688                void *buffer, unsigned bufflen)
 689{
 690        return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
 691                        NVME_QID_ANY, 0, 0);
 692}
 693EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
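/*
 * Illustrative use of nvme_submit_sync_cmd(), mirroring nvme_identify_ctrl()
 * below (a sketch, assuming a struct nvme_ctrl *ctrl and an identify buffer):
 *
 *	struct nvme_command c = { };
 *
 *	c.identify.opcode = nvme_admin_identify;
 *	c.identify.cns = NVME_ID_CNS_CTRL;
 *	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
 */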
 694
 695static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 696                unsigned len, u32 seed, bool write)
 697{
 698        struct bio_integrity_payload *bip;
 699        int ret = -ENOMEM;
 700        void *buf;
 701
 702        buf = kmalloc(len, GFP_KERNEL);
 703        if (!buf)
 704                goto out;
 705
 706        ret = -EFAULT;
 707        if (write && copy_from_user(buf, ubuf, len))
 708                goto out_free_meta;
 709
 710        bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
 711        if (IS_ERR(bip)) {
 712                ret = PTR_ERR(bip);
 713                goto out_free_meta;
 714        }
 715
 716        bip->bip_iter.bi_size = len;
 717        bip->bip_iter.bi_sector = seed;
 718        ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
 719                        offset_in_page(buf));
 720        if (ret == len)
 721                return buf;
 722        ret = -ENOMEM;
 723out_free_meta:
 724        kfree(buf);
 725out:
 726        return ERR_PTR(ret);
 727}
 728
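/*
 * Map a user space data buffer (and optional metadata buffer) onto a
 * passthrough request, execute it synchronously and copy the result and any
 * read metadata back to user space.  Used by the ioctl handlers below.
 */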
 729static int nvme_submit_user_cmd(struct request_queue *q,
 730                struct nvme_command *cmd, void __user *ubuffer,
 731                unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
 732                u32 meta_seed, u32 *result, unsigned timeout)
 733{
 734        bool write = nvme_is_write(cmd);
 735        struct nvme_ns *ns = q->queuedata;
 736        struct gendisk *disk = ns ? ns->disk : NULL;
 737        struct request *req;
 738        struct bio *bio = NULL;
 739        void *meta = NULL;
 740        int ret;
 741
 742        req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
 743        if (IS_ERR(req))
 744                return PTR_ERR(req);
 745
 746        req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
 747
 748        if (ubuffer && bufflen) {
 749                ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
 750                                GFP_KERNEL);
 751                if (ret)
 752                        goto out;
 753                bio = req->bio;
 754                bio->bi_disk = disk;
 755                if (disk && meta_buffer && meta_len) {
 756                        meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
 757                                        meta_seed, write);
 758                        if (IS_ERR(meta)) {
 759                                ret = PTR_ERR(meta);
 760                                goto out_unmap;
 761                        }
 762                }
 763        }
 764
 765        blk_execute_rq(req->q, disk, req, 0);
 766        if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
 767                ret = -EINTR;
 768        else
 769                ret = nvme_req(req)->status;
 770        if (result)
 771                *result = le32_to_cpu(nvme_req(req)->result.u32);
 772        if (meta && !ret && !write) {
 773                if (copy_to_user(meta_buffer, meta, meta_len))
 774                        ret = -EFAULT;
 775        }
 776        kfree(meta);
 777 out_unmap:
 778        if (bio)
 779                blk_rq_unmap_user(bio);
 780 out:
 781        blk_mq_free_request(req);
 782        return ret;
 783}
 784
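/*
 * Keep-alive handling: a reserved request is sent every KATO seconds; the
 * completion handler re-arms the delayed work, and an allocation failure is
 * treated as fatal and triggers a controller reset.
 */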
 785static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
 786{
 787        struct nvme_ctrl *ctrl = rq->end_io_data;
 788
 789        blk_mq_free_request(rq);
 790
 791        if (status) {
 792                dev_err(ctrl->device,
 793                        "failed nvme_keep_alive_end_io error=%d\n",
 794                                status);
 795                return;
 796        }
 797
 798        schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 799}
 800
 801static int nvme_keep_alive(struct nvme_ctrl *ctrl)
 802{
 803        struct request *rq;
 804
 805        rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED,
 806                        NVME_QID_ANY);
 807        if (IS_ERR(rq))
 808                return PTR_ERR(rq);
 809
 810        rq->timeout = ctrl->kato * HZ;
 811        rq->end_io_data = ctrl;
 812
 813        blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
 814
 815        return 0;
 816}
 817
 818static void nvme_keep_alive_work(struct work_struct *work)
 819{
 820        struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
 821                        struct nvme_ctrl, ka_work);
 822
 823        if (nvme_keep_alive(ctrl)) {
 824                /* allocation failure, reset the controller */
 825                dev_err(ctrl->device, "keep-alive failed\n");
 826                nvme_reset_ctrl(ctrl);
 827                return;
 828        }
 829}
 830
 831void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
 832{
 833        if (unlikely(ctrl->kato == 0))
 834                return;
 835
 836        INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
 837        memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 838        ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 839        schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 840}
 841EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
 842
 843void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
 844{
 845        if (unlikely(ctrl->kato == 0))
 846                return;
 847
 848        cancel_delayed_work_sync(&ctrl->ka_work);
 849}
 850EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
 851
 852static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 853{
 854        struct nvme_command c = { };
 855        int error;
 856
 857        /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 858        c.identify.opcode = nvme_admin_identify;
 859        c.identify.cns = NVME_ID_CNS_CTRL;
 860
 861        *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
 862        if (!*id)
 863                return -ENOMEM;
 864
 865        error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 866                        sizeof(struct nvme_id_ctrl));
 867        if (error)
 868                kfree(*id);
 869        return error;
 870}
 871
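/*
 * Issue Identify with CNS 03h (Namespace Identification Descriptor list) and
 * copy any EUI-64, NGUID and UUID descriptors found into *ids.
 */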
 872static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 873                struct nvme_ns_ids *ids)
 874{
 875        struct nvme_command c = { };
 876        int status;
 877        void *data;
 878        int pos;
 879        int len;
 880
 881        c.identify.opcode = nvme_admin_identify;
 882        c.identify.nsid = cpu_to_le32(nsid);
 883        c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
 884
 885        data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
 886        if (!data)
 887                return -ENOMEM;
 888
 889        status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
 890                                      NVME_IDENTIFY_DATA_SIZE);
 891        if (status)
 892                goto free_data;
 893
 894        for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
 895                struct nvme_ns_id_desc *cur = data + pos;
 896
 897                if (cur->nidl == 0)
 898                        break;
 899
 900                switch (cur->nidt) {
 901                case NVME_NIDT_EUI64:
 902                        if (cur->nidl != NVME_NIDT_EUI64_LEN) {
 903                                dev_warn(ctrl->device,
 904                                         "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
 905                                         cur->nidl);
 906                                goto free_data;
 907                        }
 908                        len = NVME_NIDT_EUI64_LEN;
 909                        memcpy(ids->eui64, data + pos + sizeof(*cur), len);
 910                        break;
 911                case NVME_NIDT_NGUID:
 912                        if (cur->nidl != NVME_NIDT_NGUID_LEN) {
 913                                dev_warn(ctrl->device,
 914                                         "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
 915                                         cur->nidl);
 916                                goto free_data;
 917                        }
 918                        len = NVME_NIDT_NGUID_LEN;
 919                        memcpy(ids->nguid, data + pos + sizeof(*cur), len);
 920                        break;
 921                case NVME_NIDT_UUID:
 922                        if (cur->nidl != NVME_NIDT_UUID_LEN) {
 923                                dev_warn(ctrl->device,
 924                                         "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
 925                                         cur->nidl);
 926                                goto free_data;
 927                        }
 928                        len = NVME_NIDT_UUID_LEN;
 929                        uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
 930                        break;
 931                default:
  932                        /* Skip unknown types */
 933                        len = cur->nidl;
 934                        break;
 935                }
 936
 937                len += sizeof(*cur);
 938        }
 939free_data:
 940        kfree(data);
 941        return status;
 942}
 943
 944static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
 945{
 946        struct nvme_command c = { };
 947
 948        c.identify.opcode = nvme_admin_identify;
 949        c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
 950        c.identify.nsid = cpu_to_le32(nsid);
 951        return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
 952}
 953
 954static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
 955                unsigned nsid)
 956{
 957        struct nvme_id_ns *id;
 958        struct nvme_command c = { };
 959        int error;
 960
 961        /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
 962        c.identify.opcode = nvme_admin_identify;
 963        c.identify.nsid = cpu_to_le32(nsid);
 964        c.identify.cns = NVME_ID_CNS_NS;
 965
 966        id = kmalloc(sizeof(*id), GFP_KERNEL);
 967        if (!id)
 968                return NULL;
 969
 970        error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
 971        if (error) {
 972                dev_warn(ctrl->device, "Identify namespace failed\n");
 973                kfree(id);
 974                return NULL;
 975        }
 976
 977        return id;
 978}
 979
 980static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
 981                      void *buffer, size_t buflen, u32 *result)
 982{
 983        struct nvme_command c;
 984        union nvme_result res;
 985        int ret;
 986
 987        memset(&c, 0, sizeof(c));
 988        c.features.opcode = nvme_admin_set_features;
 989        c.features.fid = cpu_to_le32(fid);
 990        c.features.dword11 = cpu_to_le32(dword11);
 991
 992        ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
 993                        buffer, buflen, 0, NVME_QID_ANY, 0, 0);
 994        if (ret >= 0 && result)
 995                *result = le32_to_cpu(res.u32);
 996        return ret;
 997}
 998
 999int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1000{
1001        u32 q_count = (*count - 1) | ((*count - 1) << 16);
1002        u32 result;
1003        int status, nr_io_queues;
1004
1005        status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1006                        &result);
1007        if (status < 0)
1008                return status;
1009
1010        /*
1011         * Degraded controllers might return an error when setting the queue
1012         * count.  We still want to be able to bring them online and offer
 1013         * access to the admin queue, as that might be the only way to fix them up.
1014         */
1015        if (status > 0) {
1016                dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1017                *count = 0;
1018        } else {
1019                nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1020                *count = min(*count, nr_io_queues);
1021        }
1022
1023        return 0;
1024}
1025EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1026
1027static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1028{
1029        struct nvme_user_io io;
1030        struct nvme_command c;
1031        unsigned length, meta_len;
1032        void __user *metadata;
1033
1034        if (copy_from_user(&io, uio, sizeof(io)))
1035                return -EFAULT;
1036        if (io.flags)
1037                return -EINVAL;
1038
1039        switch (io.opcode) {
1040        case nvme_cmd_write:
1041        case nvme_cmd_read:
1042        case nvme_cmd_compare:
1043                break;
1044        default:
1045                return -EINVAL;
1046        }
1047
1048        length = (io.nblocks + 1) << ns->lba_shift;
1049        meta_len = (io.nblocks + 1) * ns->ms;
1050        metadata = (void __user *)(uintptr_t)io.metadata;
1051
1052        if (ns->ext) {
1053                length += meta_len;
1054                meta_len = 0;
1055        } else if (meta_len) {
1056                if ((io.metadata & 3) || !io.metadata)
1057                        return -EINVAL;
1058        }
1059
1060        memset(&c, 0, sizeof(c));
1061        c.rw.opcode = io.opcode;
1062        c.rw.flags = io.flags;
1063        c.rw.nsid = cpu_to_le32(ns->head->ns_id);
1064        c.rw.slba = cpu_to_le64(io.slba);
1065        c.rw.length = cpu_to_le16(io.nblocks);
1066        c.rw.control = cpu_to_le16(io.control);
1067        c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1068        c.rw.reftag = cpu_to_le32(io.reftag);
1069        c.rw.apptag = cpu_to_le16(io.apptag);
1070        c.rw.appmask = cpu_to_le16(io.appmask);
1071
1072        return nvme_submit_user_cmd(ns->queue, &c,
1073                        (void __user *)(uintptr_t)io.addr, length,
1074                        metadata, meta_len, io.slba, NULL, 0);
1075}
1076
1077static u32 nvme_known_admin_effects(u8 opcode)
1078{
1079        switch (opcode) {
1080        case nvme_admin_format_nvm:
1081                return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
1082                                        NVME_CMD_EFFECTS_CSE_MASK;
1083        case nvme_admin_sanitize_nvm:
1084                return NVME_CMD_EFFECTS_CSE_MASK;
1085        default:
1086                break;
1087        }
1088        return 0;
1089}
1090
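/*
 * Before executing a passthrough command whose effects indicate LBA content
 * changes or command-set exclusion, freeze all I/O queues; nvme_passthru_end()
 * undoes this and revalidates namespaces afterwards.
 */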
1091static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1092                                                                u8 opcode)
1093{
1094        u32 effects = 0;
1095
1096        if (ns) {
1097                if (ctrl->effects)
1098                        effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
1099                if (effects & ~NVME_CMD_EFFECTS_CSUPP)
1100                        dev_warn(ctrl->device,
1101                                 "IO command:%02x has unhandled effects:%08x\n",
1102                                 opcode, effects);
1103                return 0;
1104        }
1105
1106        if (ctrl->effects)
1107                effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
1108        else
1109                effects = nvme_known_admin_effects(opcode);
1110
1111        /*
1112         * For simplicity, IO to all namespaces is quiesced even if the command
1113         * effects say only one namespace is affected.
1114         */
1115        if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
1116                nvme_start_freeze(ctrl);
1117                nvme_wait_freeze(ctrl);
1118        }
1119        return effects;
1120}
1121
1122static void nvme_update_formats(struct nvme_ctrl *ctrl)
1123{
1124        struct nvme_ns *ns, *next;
1125        LIST_HEAD(rm_list);
1126
1127        mutex_lock(&ctrl->namespaces_mutex);
1128        list_for_each_entry(ns, &ctrl->namespaces, list) {
1129                if (ns->disk && nvme_revalidate_disk(ns->disk)) {
1130                        list_move_tail(&ns->list, &rm_list);
1131                }
1132        }
1133        mutex_unlock(&ctrl->namespaces_mutex);
1134
1135        list_for_each_entry_safe(ns, next, &rm_list, list)
1136                nvme_ns_remove(ns);
1137}
1138
1139static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1140{
1141        /*
1142         * Revalidate LBA changes prior to unfreezing. This is necessary to
1143         * prevent memory corruption if a logical block size was changed by
1144         * this command.
1145         */
1146        if (effects & NVME_CMD_EFFECTS_LBCC)
1147                nvme_update_formats(ctrl);
1148        if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
1149                nvme_unfreeze(ctrl);
1150        if (effects & NVME_CMD_EFFECTS_CCC)
1151                nvme_init_identify(ctrl);
1152        if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
1153                nvme_queue_scan(ctrl);
1154}
1155
1156static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1157                        struct nvme_passthru_cmd __user *ucmd)
1158{
1159        struct nvme_passthru_cmd cmd;
1160        struct nvme_command c;
1161        unsigned timeout = 0;
1162        u32 effects;
1163        int status;
1164
1165        if (!capable(CAP_SYS_ADMIN))
1166                return -EACCES;
1167        if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1168                return -EFAULT;
1169        if (cmd.flags)
1170                return -EINVAL;
1171
1172        memset(&c, 0, sizeof(c));
1173        c.common.opcode = cmd.opcode;
1174        c.common.flags = cmd.flags;
1175        c.common.nsid = cpu_to_le32(cmd.nsid);
1176        c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1177        c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1178        c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1179        c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1180        c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1181        c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1182        c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1183        c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1184
1185        if (cmd.timeout_ms)
1186                timeout = msecs_to_jiffies(cmd.timeout_ms);
1187
1188        effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
1189        status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1190                        (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
 1191                        (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
1192                        0, &cmd.result, timeout);
1193        nvme_passthru_end(ctrl, effects);
1194
1195        if (status >= 0) {
1196                if (put_user(cmd.result, &ucmd->result))
1197                        return -EFAULT;
1198        }
1199
1200        return status;
1201}
1202
1203/*
1204 * Issue ioctl requests on the first available path.  Note that unlike normal
 1205 * block layer requests, we will not retry a failed request on another controller.
1206 */
1207static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
1208                struct nvme_ns_head **head, int *srcu_idx)
1209{
1210#ifdef CONFIG_NVME_MULTIPATH
1211        if (disk->fops == &nvme_ns_head_ops) {
1212                *head = disk->private_data;
1213                *srcu_idx = srcu_read_lock(&(*head)->srcu);
1214                return nvme_find_path(*head);
1215        }
1216#endif
1217        *head = NULL;
1218        *srcu_idx = -1;
1219        return disk->private_data;
1220}
1221
1222static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
1223{
1224        if (head)
1225                srcu_read_unlock(&head->srcu, idx);
1226}
1227
1228static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
1229{
1230        switch (cmd) {
1231        case NVME_IOCTL_ID:
1232                force_successful_syscall_return();
1233                return ns->head->ns_id;
1234        case NVME_IOCTL_ADMIN_CMD:
1235                return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
1236        case NVME_IOCTL_IO_CMD:
1237                return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
1238        case NVME_IOCTL_SUBMIT_IO:
1239                return nvme_submit_io(ns, (void __user *)arg);
1240        default:
1241#ifdef CONFIG_NVM
1242                if (ns->ndev)
1243                        return nvme_nvm_ioctl(ns, cmd, arg);
1244#endif
1245                if (is_sed_ioctl(cmd))
1246                        return sed_ioctl(ns->ctrl->opal_dev, cmd,
1247                                         (void __user *) arg);
1248                return -ENOTTY;
1249        }
1250}
1251
1252static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1253                unsigned int cmd, unsigned long arg)
1254{
1255        struct nvme_ns_head *head = NULL;
1256        struct nvme_ns *ns;
1257        int srcu_idx, ret;
1258
1259        ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1260        if (unlikely(!ns))
1261                ret = -EWOULDBLOCK;
1262        else
1263                ret = nvme_ns_ioctl(ns, cmd, arg);
1264        nvme_put_ns_from_disk(head, srcu_idx);
1265        return ret;
1266}
1267
1268static int nvme_open(struct block_device *bdev, fmode_t mode)
1269{
1270        struct nvme_ns *ns = bdev->bd_disk->private_data;
1271
1272#ifdef CONFIG_NVME_MULTIPATH
1273        /* should never be called due to GENHD_FL_HIDDEN */
1274        if (WARN_ON_ONCE(ns->head->disk))
1275                goto fail;
1276#endif
1277        if (!kref_get_unless_zero(&ns->kref))
1278                goto fail;
1279        if (!try_module_get(ns->ctrl->ops->module))
1280                goto fail_put_ns;
1281
1282        return 0;
1283
1284fail_put_ns:
1285        nvme_put_ns(ns);
1286fail:
1287        return -ENXIO;
1288}
1289
1290static void nvme_release(struct gendisk *disk, fmode_t mode)
1291{
1292        struct nvme_ns *ns = disk->private_data;
1293
1294        module_put(ns->ctrl->ops->module);
1295        nvme_put_ns(ns);
1296}
1297
1298static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1299{
1300        /* some standard values */
1301        geo->heads = 1 << 6;
1302        geo->sectors = 1 << 5;
1303        geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1304        return 0;
1305}
1306
1307#ifdef CONFIG_BLK_DEV_INTEGRITY
1308static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1309{
1310        struct blk_integrity integrity;
1311
1312        memset(&integrity, 0, sizeof(integrity));
1313        switch (pi_type) {
1314        case NVME_NS_DPS_PI_TYPE3:
1315                integrity.profile = &t10_pi_type3_crc;
1316                integrity.tag_size = sizeof(u16) + sizeof(u32);
1317                integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1318                break;
1319        case NVME_NS_DPS_PI_TYPE1:
1320        case NVME_NS_DPS_PI_TYPE2:
1321                integrity.profile = &t10_pi_type1_crc;
1322                integrity.tag_size = sizeof(u16);
1323                integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1324                break;
1325        default:
1326                integrity.profile = NULL;
1327                break;
1328        }
1329        integrity.tuple_size = ms;
1330        blk_integrity_register(disk, &integrity);
1331        blk_queue_max_integrity_segments(disk->queue, 1);
1332}
1333#else
1334static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1335{
1336}
1337#endif /* CONFIG_BLK_DEV_INTEGRITY */
1338
1339static void nvme_set_chunk_size(struct nvme_ns *ns)
1340{
1341        u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
1342        blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
1343}
1344
1345static void nvme_config_discard(struct nvme_ctrl *ctrl,
1346                unsigned stream_alignment, struct request_queue *queue)
1347{
1348        u32 size = queue_logical_block_size(queue);
1349
1350        if (stream_alignment)
1351                size *= stream_alignment;
1352
1353        BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1354                        NVME_DSM_MAX_RANGES);
1355
1356        queue->limits.discard_alignment = 0;
1357        queue->limits.discard_granularity = size;
1358
1359        blk_queue_max_discard_sectors(queue, UINT_MAX);
1360        blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
1361        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue);
1362
1363        if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1364                blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1365}
1366
1367static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1368                struct nvme_id_ns *id, struct nvme_ns_ids *ids)
1369{
1370        memset(ids, 0, sizeof(*ids));
1371
1372        if (ctrl->vs >= NVME_VS(1, 1, 0))
1373                memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
1374        if (ctrl->vs >= NVME_VS(1, 2, 0))
1375                memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
1376        if (ctrl->vs >= NVME_VS(1, 3, 0)) {
 1377                 /* Don't treat error as fatal; we potentially
 1378                  * already have an NGUID or EUI-64
1379                  */
1380                if (nvme_identify_ns_descs(ctrl, nsid, ids))
1381                        dev_warn(ctrl->device,
1382                                 "%s: Identify Descriptors failed\n", __func__);
1383        }
1384}
1385
1386static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
1387{
1388        return !uuid_is_null(&ids->uuid) ||
1389                memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
1390                memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
1391}
1392
1393static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1394{
1395        return uuid_equal(&a->uuid, &b->uuid) &&
1396                memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1397                memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
1398}
1399
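/*
 * Apply the current namespace format to the gendisk under a queue freeze:
 * logical block size, capacity, integrity profile and discard limits.
 */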
1400static void nvme_update_disk_info(struct gendisk *disk,
1401                struct nvme_ns *ns, struct nvme_id_ns *id)
1402{
1403        sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
1404        unsigned short bs = 1 << ns->lba_shift;
1405        unsigned stream_alignment = 0;
1406
1407        if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
1408                stream_alignment = ns->sws * ns->sgs;
1409
1410        blk_mq_freeze_queue(disk->queue);
1411        blk_integrity_unregister(disk);
1412
1413        blk_queue_logical_block_size(disk->queue, bs);
1414        blk_queue_physical_block_size(disk->queue, bs);
1415        blk_queue_io_min(disk->queue, bs);
1416
1417        if (ns->ms && !ns->ext &&
1418            (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1419                nvme_init_integrity(disk, ns->ms, ns->pi_type);
1420        if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
1421                capacity = 0;
1422        set_capacity(disk, capacity);
1423
1424        if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
1425                nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);
1426        blk_mq_unfreeze_queue(disk->queue);
1427}
1428
1429static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
1430{
1431        struct nvme_ns *ns = disk->private_data;
1432
1433        /*
 1434         * If identify namespace failed, use the default 512 byte block size so
 1435         * the block layer can use it before failing read/write for 0 capacity.
1436         */
1437        ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
1438        if (ns->lba_shift == 0)
1439                ns->lba_shift = 9;
1440        ns->noiob = le16_to_cpu(id->noiob);
1441        ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
1442        ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
 1443        /* the PI implementation requires metadata equal to the t10 pi tuple size */
1444        if (ns->ms == sizeof(struct t10_pi_tuple))
1445                ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1446        else
1447                ns->pi_type = 0;
1448
1449        if (ns->noiob)
1450                nvme_set_chunk_size(ns);
1451        nvme_update_disk_info(disk, ns, id);
1452#ifdef CONFIG_NVME_MULTIPATH
1453        if (ns->head->disk)
1454                nvme_update_disk_info(ns->head->disk, ns, id);
1455#endif
1456}
1457
1458static int nvme_revalidate_disk(struct gendisk *disk)
1459{
1460        struct nvme_ns *ns = disk->private_data;
1461        struct nvme_ctrl *ctrl = ns->ctrl;
1462        struct nvme_id_ns *id;
1463        struct nvme_ns_ids ids;
1464        int ret = 0;
1465
1466        if (test_bit(NVME_NS_DEAD, &ns->flags)) {
1467                set_capacity(disk, 0);
1468                return -ENODEV;
1469        }
1470
1471        id = nvme_identify_ns(ctrl, ns->head->ns_id);
1472        if (!id)
1473                return -ENODEV;
1474
1475        if (id->ncap == 0) {
1476                ret = -ENODEV;
1477                goto out;
1478        }
1479
1480        __nvme_revalidate_disk(disk, id);
1481        nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
1482        if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
1483                dev_err(ctrl->device,
1484                        "identifiers changed for nsid %d\n", ns->head->ns_id);
1485                ret = -ENODEV;
1486        }
1487
1488out:
1489        kfree(id);
1490        return ret;
1491}
1492
1493static char nvme_pr_type(enum pr_type type)
1494{
1495        switch (type) {
1496        case PR_WRITE_EXCLUSIVE:
1497                return 1;
1498        case PR_EXCLUSIVE_ACCESS:
1499                return 2;
1500        case PR_WRITE_EXCLUSIVE_REG_ONLY:
1501                return 3;
1502        case PR_EXCLUSIVE_ACCESS_REG_ONLY:
1503                return 4;
1504        case PR_WRITE_EXCLUSIVE_ALL_REGS:
1505                return 5;
1506        case PR_EXCLUSIVE_ACCESS_ALL_REGS:
1507                return 6;
1508        default:
1509                return 0;
1510        }
1511};
1512
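/*
 * Common helper for the persistent reservation ops: build the 16-byte
 * key/service-action-key payload and issue the reservation command on the
 * first available path.
 */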
1513static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
1514                                u64 key, u64 sa_key, u8 op)
1515{
1516        struct nvme_ns_head *head = NULL;
1517        struct nvme_ns *ns;
1518        struct nvme_command c;
1519        int srcu_idx, ret;
1520        u8 data[16] = { 0, };
1521
1522        ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1523        if (unlikely(!ns))
1524                return -EWOULDBLOCK;
1525
1526        put_unaligned_le64(key, &data[0]);
1527        put_unaligned_le64(sa_key, &data[8]);
1528
1529        memset(&c, 0, sizeof(c));
1530        c.common.opcode = op;
1531        c.common.nsid = cpu_to_le32(ns->head->ns_id);
1532        c.common.cdw10[0] = cpu_to_le32(cdw10);
1533
1534        ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
1535        nvme_put_ns_from_disk(head, srcu_idx);
1536        return ret;
1537}
1538
1539static int nvme_pr_register(struct block_device *bdev, u64 old,
1540                u64 new, unsigned flags)
1541{
1542        u32 cdw10;
1543
1544        if (flags & ~PR_FL_IGNORE_KEY)
1545                return -EOPNOTSUPP;
1546
1547        cdw10 = old ? 2 : 0;
1548        cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
1549        cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
1550        return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
1551}
1552
1553static int nvme_pr_reserve(struct block_device *bdev, u64 key,
1554                enum pr_type type, unsigned flags)
1555{
1556        u32 cdw10;
1557
1558        if (flags & ~PR_FL_IGNORE_KEY)
1559                return -EOPNOTSUPP;
1560
1561        cdw10 = nvme_pr_type(type) << 8;
1562        cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
1563        return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
1564}
1565
1566static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
1567                enum pr_type type, bool abort)
1568{
1569        u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
1570        return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
1571}
1572
1573static int nvme_pr_clear(struct block_device *bdev, u64 key)
1574{
1575        u32 cdw10 = 1 | (key ? 1 << 3 : 0);
1576        return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
1577}
1578
1579static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
1580{
1581        u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
1582        return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
1583}
1584
1585static const struct pr_ops nvme_pr_ops = {
1586        .pr_register    = nvme_pr_register,
1587        .pr_reserve     = nvme_pr_reserve,
1588        .pr_release     = nvme_pr_release,
1589        .pr_preempt     = nvme_pr_preempt,
1590        .pr_clear       = nvme_pr_clear,
1591};
1592
1593#ifdef CONFIG_BLK_SED_OPAL
1594int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
1595                bool send)
1596{
1597        struct nvme_ctrl *ctrl = data;
1598        struct nvme_command cmd;
1599
1600        memset(&cmd, 0, sizeof(cmd));
1601        if (send)
1602                cmd.common.opcode = nvme_admin_security_send;
1603        else
1604                cmd.common.opcode = nvme_admin_security_recv;
1605        cmd.common.nsid = 0;
1606        cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
1607        cmd.common.cdw10[1] = cpu_to_le32(len);
1608
1609        return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
1610                                      ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
1611}
1612EXPORT_SYMBOL_GPL(nvme_sec_submit);
1613#endif /* CONFIG_BLK_SED_OPAL */
1614
1615static const struct block_device_operations nvme_fops = {
1616        .owner          = THIS_MODULE,
1617        .ioctl          = nvme_ioctl,
1618        .compat_ioctl   = nvme_ioctl,
1619        .open           = nvme_open,
1620        .release        = nvme_release,
1621        .getgeo         = nvme_getgeo,
1622        .revalidate_disk= nvme_revalidate_disk,
1623        .pr_ops         = &nvme_pr_ops,
1624};
1625
1626#ifdef CONFIG_NVME_MULTIPATH
1627static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
1628{
1629        struct nvme_ns_head *head = bdev->bd_disk->private_data;
1630
1631        if (!kref_get_unless_zero(&head->ref))
1632                return -ENXIO;
1633        return 0;
1634}
1635
1636static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
1637{
1638        nvme_put_ns_head(disk->private_data);
1639}
1640
1641const struct block_device_operations nvme_ns_head_ops = {
1642        .owner          = THIS_MODULE,
1643        .open           = nvme_ns_head_open,
1644        .release        = nvme_ns_head_release,
1645        .ioctl          = nvme_ioctl,
1646        .compat_ioctl   = nvme_ioctl,
1647        .getgeo         = nvme_getgeo,
1648        .pr_ops         = &nvme_pr_ops,
1649};
1650#endif /* CONFIG_NVME_MULTIPATH */
1651
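/*
 * Poll CSTS until CSTS.RDY matches the requested state.  The timeout is
 * derived from CAP.TO (reported in 500ms units); a CSTS value of all ones
 * means the device has gone away.
 */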
1652static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
1653{
1654        unsigned long timeout =
1655                ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1656        u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
1657        int ret;
1658
1659        while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1660                if (csts == ~0)
1661                        return -ENODEV;
1662                if ((csts & NVME_CSTS_RDY) == bit)
1663                        break;
1664
1665                msleep(100);
1666                if (fatal_signal_pending(current))
1667                        return -EINTR;
1668                if (time_after(jiffies, timeout)) {
1669                        dev_err(ctrl->device,
1670                                "Device not ready; aborting %s\n", enabled ?
1671                                                "initialisation" : "reset");
1672                        return -ENODEV;
1673                }
1674        }
1675
1676        return ret;
1677}
1678
1679/*
1680 * If the device has been passed off to us in an enabled state, just clear
1681 * the enabled bit.  The spec says we should set the 'shutdown notification
1682 * bits', but doing so may cause the device to complete commands to the
1683 * admin queue ... and we don't know what memory that might be pointing at!
1684 */
1685int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1686{
1687        int ret;
1688
1689        ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1690        ctrl->ctrl_config &= ~NVME_CC_ENABLE;
1691
1692        ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1693        if (ret)
1694                return ret;
1695
1696        if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
1697                msleep(NVME_QUIRK_DELAY_AMOUNT);
1698
1699        return nvme_wait_ready(ctrl, cap, false);
1700}
1701EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
1702
1703int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1704{
1705        /*
1706         * Default to a 4K page size, with the intention to update this
1707         * path in the future to accommodate architectures with differing
1708         * kernel and IO page sizes.
1709         */
1710        unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
1711        int ret;
1712
1713        if (page_shift < dev_page_min) {
1714                dev_err(ctrl->device,
1715                        "Minimum device page size %u too large for host (%u)\n",
1716                        1 << dev_page_min, 1 << page_shift);
1717                return -ENODEV;
1718        }
1719
1720        ctrl->page_size = 1 << page_shift;
1721
1722        ctrl->ctrl_config = NVME_CC_CSS_NVM;
1723        ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
1724        ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
1725        ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1726        ctrl->ctrl_config |= NVME_CC_ENABLE;
1727
1728        ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1729        if (ret)
1730                return ret;
1731        return nvme_wait_ready(ctrl, cap, true);
1732}
1733EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
1734
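/*
 * Request a normal shutdown via CC.SHN and poll CSTS.SHST until the
 * controller reports shutdown processing complete or ctrl->shutdown_timeout
 * expires.
 */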
1735int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
1736{
1737        unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
1738        u32 csts;
1739        int ret;
1740
1741        ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1742        ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
1743
1744        ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1745        if (ret)
1746                return ret;
1747
1748        while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1749                if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
1750                        break;
1751
1752                msleep(100);
1753                if (fatal_signal_pending(current))
1754                        return -EINTR;
1755                if (time_after(jiffies, timeout)) {
1756                        dev_err(ctrl->device,
1757                                "Device shutdown incomplete; abort shutdown\n");
1758                        return -ENODEV;
1759                }
1760        }
1761
1762        return ret;
1763}
1764EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
1765
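/*
 * Propagate controller-wide limits to a request queue: maximum transfer
 * size and segment count, the stripe-size quirk, the page-sized virt
 * boundary, and whether a volatile write cache is present.
 */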
1766static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1767                struct request_queue *q)
1768{
1769        bool vwc = false;
1770
1771        if (ctrl->max_hw_sectors) {
1772                u32 max_segments =
1773                        (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
1774
1775                blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1776                blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1777        }
1778        if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
1779            is_power_of_2(ctrl->max_hw_sectors))
1780                blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
1781        blk_queue_virt_boundary(q, ctrl->page_size - 1);
1782        if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1783                vwc = true;
1784        blk_queue_write_cache(q, vwc, vwc);
1785}
1786
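/*
 * If the controller advertises the optional Timestamp feature in ONCS, seed
 * it with the current wall-clock time in milliseconds.
 */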
1787static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
1788{
1789        __le64 ts;
1790        int ret;
1791
1792        if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
1793                return 0;
1794
1795        ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
1796        ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
1797                        NULL);
1798        if (ret)
1799                dev_warn_once(ctrl->device,
1800                        "could not set timestamp (%d)\n", ret);
1801        return ret;
1802}
1803
1804static int nvme_configure_apst(struct nvme_ctrl *ctrl)
1805{
1806        /*
1807         * APST (Autonomous Power State Transition) lets us program a
1808         * table of power state transitions that the controller will
1809         * perform automatically.  We configure it with a simple
1810         * heuristic: we are willing to spend at most 2% of the time
1811         * transitioning between power states.  Therefore, when running
1812         * in any given state, we will enter the next lower-power
1813         * non-operational state after waiting 50 * (enlat + exlat)
1814         * microseconds, as long as that state's exit latency is under
1815         * the requested maximum latency.
1816         *
1817         * We will not autonomously enter any non-operational state for
1818         * which the total latency exceeds ps_max_latency_us.  Users
1819         * can set ps_max_latency_us to zero to turn off APST.
1820         */
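        /*
         * Example: a non-operational state with enlat + exlat = 10000us is
         * given an idle timer of 10000 / 20 = 500ms, so its ~10ms of
         * transition latency is paid at most once per 500ms of idle time,
         * staying within the 2% budget above.
         */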
1821
1822        unsigned apste;
1823        struct nvme_feat_auto_pst *table;
1824        u64 max_lat_us = 0;
1825        int max_ps = -1;
1826        int ret;
1827
1828        /*
1829         * If APST isn't supported or if we haven't been initialized yet,
1830         * then don't do anything.
1831         */
1832        if (!ctrl->apsta)
1833                return 0;
1834
1835        if (ctrl->npss > 31) {
1836                dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
1837                return 0;
1838        }
1839
1840        table = kzalloc(sizeof(*table), GFP_KERNEL);
1841        if (!table)
1842                return 0;
1843
1844        if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
1845                /* Turn off APST. */
1846                apste = 0;
1847                dev_dbg(ctrl->device, "APST disabled\n");
1848        } else {
1849                __le64 target = cpu_to_le64(0);
1850                int state;
1851
1852                /*
1853                 * Walk through all states from lowest- to highest-power.
1854                 * According to the spec, lower-numbered states use more
1855                 * power.  NPSS, despite the name, is the index of the
1856                 * lowest-power state, not the number of states.
1857                 */
1858                for (state = (int)ctrl->npss; state >= 0; state--) {
1859                        u64 total_latency_us, exit_latency_us, transition_ms;
1860
1861                        if (target)
1862                                table->entries[state] = target;
1863
1864                        /*
1865                         * Don't allow transitions to the deepest state
1866                         * if it's quirked off.
1867                         */
1868                        if (state == ctrl->npss &&
1869                            (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
1870                                continue;
1871
1872                        /*
1873                         * Is this state a useful non-operational state for
1874                         * higher-power states to autonomously transition to?
1875                         */
1876                        if (!(ctrl->psd[state].flags &
1877                              NVME_PS_FLAGS_NON_OP_STATE))
1878                                continue;
1879
1880                        exit_latency_us =
1881                                (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
1882                        if (exit_latency_us > ctrl->ps_max_latency_us)
1883                                continue;
1884
1885                        total_latency_us =
1886                                exit_latency_us +
1887                                le32_to_cpu(ctrl->psd[state].entry_lat);
1888
1889                        /*
1890                         * This state is good.  Use it as the APST idle
1891                         * target for higher power states.
1892                         */
1893                        transition_ms = total_latency_us + 19;
1894                        do_div(transition_ms, 20);
1895                        if (transition_ms > (1 << 24) - 1)
1896                                transition_ms = (1 << 24) - 1;
1897
1898                        target = cpu_to_le64((state << 3) |
1899                                             (transition_ms << 8));
1900
1901                        if (max_ps == -1)
1902                                max_ps = state;
1903
1904                        if (total_latency_us > max_lat_us)
1905                                max_lat_us = total_latency_us;
1906                }
1907
1908                apste = 1;
1909
1910                if (max_ps == -1) {
1911                        dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
1912                } else {
1913                        dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
1914                                max_ps, max_lat_us, (int)sizeof(*table), table);
1915                }
1916        }
1917
1918        ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
1919                                table, sizeof(*table), NULL);
1920        if (ret)
1921                dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
1922
1923        kfree(table);
1924        return ret;
1925}
1926
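/*
 * PM QoS latency tolerance callback: map the requested tolerance onto
 * ps_max_latency_us (treating "any"/"no constraint" as unlimited) and
 * reprogram APST whenever the value changes.
 */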
1927static void nvme_set_latency_tolerance(struct device *dev, s32 val)
1928{
1929        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1930        u64 latency;
1931
1932        switch (val) {
1933        case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
1934        case PM_QOS_LATENCY_ANY:
1935                latency = U64_MAX;
1936                break;
1937
1938        default:
1939                latency = val;
1940        }
1941
1942        if (ctrl->ps_max_latency_us != latency) {
1943                ctrl->ps_max_latency_us = latency;
1944                nvme_configure_apst(ctrl);
1945        }
1946}
1947
1948struct nvme_core_quirk_entry {
1949        /*
1950         * NVMe model and firmware strings are padded with spaces.  For
1951         * simplicity, strings in the quirk table are padded with NULLs
1952         * instead.
1953         */
1954        u16 vid;
1955        const char *mn;
1956        const char *fr;
1957        unsigned long quirks;
1958};
1959
1960static const struct nvme_core_quirk_entry core_quirks[] = {
1961        {
1962                /*
1963                 * This Toshiba device seems to die when using any APST state.  See:
1964                 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
1965                 */
1966                .vid = 0x1179,
1967                .mn = "THNSF5256GPUK TOSHIBA",
1968                .quirks = NVME_QUIRK_NO_APST,
1969        }
1970};
1971
1972/* match is null-terminated but idstr is space-padded. */
1973static bool string_matches(const char *idstr, const char *match, size_t len)
1974{
1975        size_t matchlen;
1976
1977        if (!match)
1978                return true;
1979
1980        matchlen = strlen(match);
1981        WARN_ON_ONCE(matchlen > len);
1982
1983        if (memcmp(idstr, match, matchlen))
1984                return false;
1985
1986        for (; matchlen < len; matchlen++)
1987                if (idstr[matchlen] != ' ')
1988                        return false;
1989
1990        return true;
1991}
1992
1993static bool quirk_matches(const struct nvme_id_ctrl *id,
1994                          const struct nvme_core_quirk_entry *q)
1995{
1996        return q->vid == le16_to_cpu(id->vid) &&
1997                string_matches(id->mn, q->mn, sizeof(id->mn)) &&
1998                string_matches(id->fr, q->fr, sizeof(id->fr));
1999}
2000
2001static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2002                struct nvme_id_ctrl *id)
2003{
2004        size_t nqnlen;
2005        int off;
2006
2007        nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2008        if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2009                strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2010                return;
2011        }
2012
2013        if (ctrl->vs >= NVME_VS(1, 2, 1))
2014                dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2015
2016        /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
2017        off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2018                        "nqn.2014.08.org.nvmexpress:%04x%04x",
2019                        le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2020        memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2021        off += sizeof(id->sn);
2022        memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2023        off += sizeof(id->mn);
2024        memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2025}
2026
2027static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
2028{
2029        ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
2030        kfree(subsys);
2031}
2032
2033static void nvme_release_subsystem(struct device *dev)
2034{
2035        __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
2036}
2037
2038static void nvme_destroy_subsystem(struct kref *ref)
2039{
2040        struct nvme_subsystem *subsys =
2041                        container_of(ref, struct nvme_subsystem, ref);
2042
2043        mutex_lock(&nvme_subsystems_lock);
2044        list_del(&subsys->entry);
2045        mutex_unlock(&nvme_subsystems_lock);
2046
2047        ida_destroy(&subsys->ns_ida);
2048        device_del(&subsys->dev);
2049        put_device(&subsys->dev);
2050}
2051
2052static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2053{
2054        kref_put(&subsys->ref, nvme_destroy_subsystem);
2055}
2056
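/*
 * Look up a subsystem by NQN and take a reference on it.  Must be called
 * with nvme_subsystems_lock held; returns NULL if no live subsystem matches.
 */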
2057static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2058{
2059        struct nvme_subsystem *subsys;
2060
2061        lockdep_assert_held(&nvme_subsystems_lock);
2062
2063        list_for_each_entry(subsys, &nvme_subsystems, entry) {
2064                if (strcmp(subsys->subnqn, subsysnqn))
2065                        continue;
2066                if (!kref_get_unless_zero(&subsys->ref))
2067                        continue;
2068                return subsys;
2069        }
2070
2071        return NULL;
2072}
2073
2074#define SUBSYS_ATTR_RO(_name, _mode, _show)                     \
2075        struct device_attribute subsys_attr_##_name = \
2076                __ATTR(_name, _mode, _show, NULL)
2077
2078static ssize_t nvme_subsys_show_nqn(struct device *dev,
2079                                    struct device_attribute *attr,
2080                                    char *buf)
2081{
2082        struct nvme_subsystem *subsys =
2083                container_of(dev, struct nvme_subsystem, dev);
2084
2085        return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
2086}
2087static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2088
2089#define nvme_subsys_show_str_function(field)                            \
2090static ssize_t subsys_##field##_show(struct device *dev,                \
2091                            struct device_attribute *attr, char *buf)   \
2092{                                                                       \
2093        struct nvme_subsystem *subsys =                                 \
2094                container_of(dev, struct nvme_subsystem, dev);          \
2095        return sprintf(buf, "%.*s\n",                                   \
2096                       (int)sizeof(subsys->field), subsys->field);      \
2097}                                                                       \
2098static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2099
2100nvme_subsys_show_str_function(model);
2101nvme_subsys_show_str_function(serial);
2102nvme_subsys_show_str_function(firmware_rev);
2103
2104static struct attribute *nvme_subsys_attrs[] = {
2105        &subsys_attr_model.attr,
2106        &subsys_attr_serial.attr,
2107        &subsys_attr_firmware_rev.attr,
2108        &subsys_attr_subsysnqn.attr,
2109        NULL,
2110};
2111
2112static struct attribute_group nvme_subsys_attrs_group = {
2113        .attrs = nvme_subsys_attrs,
2114};
2115
2116static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2117        &nvme_subsys_attrs_group,
2118        NULL,
2119};
2120
2121static int nvme_active_ctrls(struct nvme_subsystem *subsys)
2122{
2123        int count = 0;
2124        struct nvme_ctrl *ctrl;
2125
2126        mutex_lock(&subsys->lock);
2127        list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
2128                if (ctrl->state != NVME_CTRL_DELETING &&
2129                    ctrl->state != NVME_CTRL_DEAD)
2130                        count++;
2131        }
2132        mutex_unlock(&subsys->lock);
2133
2134        return count;
2135}
2136
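/*
 * Bind a controller to its NVM subsystem: allocate a new nvme_subsystem for
 * the reported SUBNQN or attach to an existing one, rejecting a duplicate
 * NQN when the controller does not advertise multi-controller support in
 * CMIC.
 */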
2137static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2138{
2139        struct nvme_subsystem *subsys, *found;
2140        int ret;
2141
2142        subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2143        if (!subsys)
2144                return -ENOMEM;
2145        ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
2146        if (ret < 0) {
2147                kfree(subsys);
2148                return ret;
2149        }
2150        subsys->instance = ret;
2151        mutex_init(&subsys->lock);
2152        kref_init(&subsys->ref);
2153        INIT_LIST_HEAD(&subsys->ctrls);
2154        INIT_LIST_HEAD(&subsys->nsheads);
2155        nvme_init_subnqn(subsys, ctrl, id);
2156        memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2157        memcpy(subsys->model, id->mn, sizeof(subsys->model));
2158        memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
2159        subsys->vendor_id = le16_to_cpu(id->vid);
2160        subsys->cmic = id->cmic;
2161
2162        subsys->dev.class = nvme_subsys_class;
2163        subsys->dev.release = nvme_release_subsystem;
2164        subsys->dev.groups = nvme_subsys_attrs_groups;
2165        dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
2166        device_initialize(&subsys->dev);
2167
2168        mutex_lock(&nvme_subsystems_lock);
2169        found = __nvme_find_get_subsystem(subsys->subnqn);
2170        if (found) {
2171                /*
2172                 * Verify that the subsystem actually supports multiple
2173                 * controllers, else bail out.
2174                 */
2175                if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
2176                        dev_err(ctrl->device,
2177                                "ignoring ctrl due to duplicate subnqn (%s).\n",
2178                                found->subnqn);
2179                        nvme_put_subsystem(found);
2180                        ret = -EINVAL;
2181                        goto out_unlock;
2182                }
2183
2184                __nvme_release_subsystem(subsys);
2185                subsys = found;
2186        } else {
2187                ret = device_add(&subsys->dev);
2188                if (ret) {
2189                        dev_err(ctrl->device,
2190                                "failed to register subsystem device.\n");
2191                        goto out_unlock;
2192                }
2193                ida_init(&subsys->ns_ida);
2194                list_add_tail(&subsys->entry, &nvme_subsystems);
2195        }
2196
2197        ctrl->subsys = subsys;
2198        mutex_unlock(&nvme_subsystems_lock);
2199
2200        if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2201                        dev_name(ctrl->device))) {
2202                dev_err(ctrl->device,
2203                        "failed to create sysfs link from subsystem.\n");
2204                /* the transport driver will eventually put the subsystem */
2205                return -EINVAL;
2206        }
2207
2208        mutex_lock(&subsys->lock);
2209        list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2210        mutex_unlock(&subsys->lock);
2211
2212        return 0;
2213
2214out_unlock:
2215        mutex_unlock(&nvme_subsystems_lock);
2216        put_device(&subsys->dev);
2217        return ret;
2218}
2219
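/*
 * Read a log page for the whole subsystem (NVME_NSID_ALL) into a kernel
 * buffer using a synchronous admin command.
 */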
2220static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
2221                        size_t size)
2222{
2223        struct nvme_command c = { };
2224
2225        c.common.opcode = nvme_admin_get_log_page;
2226        c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
2227        c.common.cdw10[0] = nvme_get_log_dw10(log_page, size);
2228
2229        return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
2230}
2231
2232static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
2233{
2234        int ret;
2235
2236        if (!ctrl->effects)
2237                ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
2238
2239        if (!ctrl->effects)
2240                return 0;
2241
2242        ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
2243                                        sizeof(*ctrl->effects));
2244        if (ret) {
2245                kfree(ctrl->effects);
2246                ctrl->effects = NULL;
2247        }
2248        return ret;
2249}
2250
2251/*
2252 * Initialize the cached copies of the Identify data and various controller
2253 * registers in our nvme_ctrl structure.  This should be called as soon as
2254 * the admin queue is fully up and running.
2255 */
2256int nvme_init_identify(struct nvme_ctrl *ctrl)
2257{
2258        struct nvme_id_ctrl *id;
2259        u64 cap;
2260        int ret, page_shift;
2261        u32 max_hw_sectors;
2262        bool prev_apst_enabled;
2263
2264        ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
2265        if (ret) {
2266                dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
2267                return ret;
2268        }
2269
2270        ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
2271        if (ret) {
2272                dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2273                return ret;
2274        }
2275        page_shift = NVME_CAP_MPSMIN(cap) + 12;
2276
2277        if (ctrl->vs >= NVME_VS(1, 1, 0))
2278                ctrl->subsystem = NVME_CAP_NSSRC(cap);
2279
2280        ret = nvme_identify_ctrl(ctrl, &id);
2281        if (ret) {
2282                dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
2283                return -EIO;
2284        }
2285
2286        if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
2287                ret = nvme_get_effects_log(ctrl);
2288                if (ret < 0)
2289                        return ret;
2290        }
2291
2292        if (!ctrl->identified) {
2293                int i;
2294
2295                ret = nvme_init_subsystem(ctrl, id);
2296                if (ret)
2297                        goto out_free;
2298
2299                /*
2300                 * Check for quirks.  Quirks can depend on firmware version,
2301                 * so, in principle, the set of quirks present can change
2302                 * across a reset.  As a possible future enhancement, we
2303                 * could re-scan for quirks every time we reinitialize
2304                 * the device, but we'd have to make sure that the driver
2305                 * behaves intelligently if the quirks change.
2306                 */
2307                for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
2308                        if (quirk_matches(id, &core_quirks[i]))
2309                                ctrl->quirks |= core_quirks[i].quirks;
2310                }
2311        }
2312
2313        if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
2314                dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
2315                ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
2316        }
2317
2318        ctrl->oacs = le16_to_cpu(id->oacs);
2319        ctrl->oncs = le16_to_cpup(&id->oncs);
2320        atomic_set(&ctrl->abort_limit, id->acl + 1);
2321        ctrl->vwc = id->vwc;
2322        ctrl->cntlid = le16_to_cpup(&id->cntlid);
2323        if (id->mdts)
2324                max_hw_sectors = 1 << (id->mdts + page_shift - 9);
2325        else
2326                max_hw_sectors = UINT_MAX;
2327        ctrl->max_hw_sectors =
2328                min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
2329
2330        nvme_set_queue_limits(ctrl, ctrl->admin_q);
2331        ctrl->sgls = le32_to_cpu(id->sgls);
2332        ctrl->kas = le16_to_cpu(id->kas);
2333
2334        if (id->rtd3e) {
2335                /* us -> s */
2336                u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
2337
2338                ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
2339                                                 shutdown_timeout, 60);
2340
2341                if (ctrl->shutdown_timeout != shutdown_timeout)
2342                        dev_info(ctrl->device,
2343                                 "Shutdown timeout set to %u seconds\n",
2344                                 ctrl->shutdown_timeout);
2345        } else
2346                ctrl->shutdown_timeout = shutdown_timeout;
2347
2348        ctrl->npss = id->npss;
2349        ctrl->apsta = id->apsta;
2350        prev_apst_enabled = ctrl->apst_enabled;
2351        if (ctrl->quirks & NVME_QUIRK_NO_APST) {
2352                if (force_apst && id->apsta) {
2353                        dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
2354                        ctrl->apst_enabled = true;
2355                } else {
2356                        ctrl->apst_enabled = false;
2357                }
2358        } else {
2359                ctrl->apst_enabled = id->apsta;
2360        }
2361        memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
2362
2363        if (ctrl->ops->flags & NVME_F_FABRICS) {
2364                ctrl->icdoff = le16_to_cpu(id->icdoff);
2365                ctrl->ioccsz = le32_to_cpu(id->ioccsz);
2366                ctrl->iorcsz = le32_to_cpu(id->iorcsz);
2367                ctrl->maxcmd = le16_to_cpu(id->maxcmd);
2368
2369                /*
2370                 * In fabrics we need to verify the cntlid matches the
2371                 * admin connect
2372                 */
2373                if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
2374                        ret = -EINVAL;
2375                        goto out_free;
2376                }
2377
2378                if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
2379                        dev_err(ctrl->device,
2380                                "keep-alive support is mandatory for fabrics\n");
2381                        ret = -EINVAL;
2382                        goto out_free;
2383                }
2384        } else {
2385                ctrl->cntlid = le16_to_cpu(id->cntlid);
2386                ctrl->hmpre = le32_to_cpu(id->hmpre);
2387                ctrl->hmmin = le32_to_cpu(id->hmmin);
2388                ctrl->hmminds = le32_to_cpu(id->hmminds);
2389                ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
2390        }
2391
2392        kfree(id);
2393
2394        if (ctrl->apst_enabled && !prev_apst_enabled)
2395                dev_pm_qos_expose_latency_tolerance(ctrl->device);
2396        else if (!ctrl->apst_enabled && prev_apst_enabled)
2397                dev_pm_qos_hide_latency_tolerance(ctrl->device);
2398
2399        ret = nvme_configure_apst(ctrl);
2400        if (ret < 0)
2401                return ret;
2402
2403        ret = nvme_configure_timestamp(ctrl);
2404        if (ret < 0)
2405                return ret;
2406
2407        ret = nvme_configure_directives(ctrl);
2408        if (ret < 0)
2409                return ret;
2410
2411        ctrl->identified = true;
2412
2413        return 0;
2414
2415out_free:
2416        kfree(id);
2417        return ret;
2418}
2419EXPORT_SYMBOL_GPL(nvme_init_identify);
2420
2421static int nvme_dev_open(struct inode *inode, struct file *file)
2422{
2423        struct nvme_ctrl *ctrl =
2424                container_of(inode->i_cdev, struct nvme_ctrl, cdev);
2425
2426        switch (ctrl->state) {
2427        case NVME_CTRL_LIVE:
2428        case NVME_CTRL_ADMIN_ONLY:
2429                break;
2430        default:
2431                return -EWOULDBLOCK;
2432        }
2433
2434        file->private_data = ctrl;
2435        return 0;
2436}
2437
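/*
 * Handle the deprecated NVME_IOCTL_IO_CMD on the controller character
 * device: it is only allowed when exactly one namespace exists, in which
 * case the passthrough command is issued against that namespace.
 */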
2438static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
2439{
2440        struct nvme_ns *ns;
2441        int ret;
2442
2443        mutex_lock(&ctrl->namespaces_mutex);
2444        if (list_empty(&ctrl->namespaces)) {
2445                ret = -ENOTTY;
2446                goto out_unlock;
2447        }
2448
2449        ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
2450        if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
2451                dev_warn(ctrl->device,
2452                        "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
2453                ret = -EINVAL;
2454                goto out_unlock;
2455        }
2456
2457        dev_warn(ctrl->device,
2458                "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
2459        kref_get(&ns->kref);
2460        mutex_unlock(&ctrl->namespaces_mutex);
2461
2462        ret = nvme_user_cmd(ctrl, ns, argp);
2463        nvme_put_ns(ns);
2464        return ret;
2465
2466out_unlock:
2467        mutex_unlock(&ctrl->namespaces_mutex);
2468        return ret;
2469}
2470
2471static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
2472                unsigned long arg)
2473{
2474        struct nvme_ctrl *ctrl = file->private_data;
2475        void __user *argp = (void __user *)arg;
2476
2477        switch (cmd) {
2478        case NVME_IOCTL_ADMIN_CMD:
2479                return nvme_user_cmd(ctrl, NULL, argp);
2480        case NVME_IOCTL_IO_CMD:
2481                return nvme_dev_user_cmd(ctrl, argp);
2482        case NVME_IOCTL_RESET:
2483                dev_warn(ctrl->device, "resetting controller\n");
2484                return nvme_reset_ctrl_sync(ctrl);
2485        case NVME_IOCTL_SUBSYS_RESET:
2486                return nvme_reset_subsystem(ctrl);
2487        case NVME_IOCTL_RESCAN:
2488                nvme_queue_scan(ctrl);
2489                return 0;
2490        default:
2491                return -ENOTTY;
2492        }
2493}
2494
2495static const struct file_operations nvme_dev_fops = {
2496        .owner          = THIS_MODULE,
2497        .open           = nvme_dev_open,
2498        .unlocked_ioctl = nvme_dev_ioctl,
2499        .compat_ioctl   = nvme_dev_ioctl,
2500};
2501
2502static ssize_t nvme_sysfs_reset(struct device *dev,
2503                                struct device_attribute *attr, const char *buf,
2504                                size_t count)
2505{
2506        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2507        int ret;
2508
2509        ret = nvme_reset_ctrl_sync(ctrl);
2510        if (ret < 0)
2511                return ret;
2512        return count;
2513}
2514static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
2515
2516static ssize_t nvme_sysfs_rescan(struct device *dev,
2517                                struct device_attribute *attr, const char *buf,
2518                                size_t count)
2519{
2520        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2521
2522        nvme_queue_scan(ctrl);
2523        return count;
2524}
2525static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
2526
2527static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
2528{
2529        struct gendisk *disk = dev_to_disk(dev);
2530
2531        if (disk->fops == &nvme_fops)
2532                return nvme_get_ns_from_dev(dev)->head;
2533        else
2534                return disk->private_data;
2535}
2536
2537static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
2538                char *buf)
2539{
2540        struct nvme_ns_head *head = dev_to_ns_head(dev);
2541        struct nvme_ns_ids *ids = &head->ids;
2542        struct nvme_subsystem *subsys = head->subsys;
2543        int serial_len = sizeof(subsys->serial);
2544        int model_len = sizeof(subsys->model);
2545
2546        if (!uuid_is_null(&ids->uuid))
2547                return sprintf(buf, "uuid.%pU\n", &ids->uuid);
2548
2549        if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2550                return sprintf(buf, "eui.%16phN\n", ids->nguid);
2551
2552        if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2553                return sprintf(buf, "eui.%8phN\n", ids->eui64);
2554
2555        while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
2556                                  subsys->serial[serial_len - 1] == '\0'))
2557                serial_len--;
2558        while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
2559                                 subsys->model[model_len - 1] == '\0'))
2560                model_len--;
2561
2562        return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
2563                serial_len, subsys->serial, model_len, subsys->model,
2564                head->ns_id);
2565}
2566static DEVICE_ATTR_RO(wwid);
2567
2568static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
2569                char *buf)
2570{
2571        return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
2572}
2573static DEVICE_ATTR_RO(nguid);
2574
2575static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
2576                char *buf)
2577{
2578        struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
2579
2580        /* For backward compatibility expose the NGUID to userspace if
2581         * we have no UUID set
2582         */
2583        if (uuid_is_null(&ids->uuid)) {
2584                printk_ratelimited(KERN_WARNING
2585                                   "No UUID available providing old NGUID\n");
2586                return sprintf(buf, "%pU\n", ids->nguid);
2587        }
2588        return sprintf(buf, "%pU\n", &ids->uuid);
2589}
2590static DEVICE_ATTR_RO(uuid);
2591
2592static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
2593                char *buf)
2594{
2595        return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
2596}
2597static DEVICE_ATTR_RO(eui);
2598
2599static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
2600                char *buf)
2601{
2602        return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
2603}
2604static DEVICE_ATTR_RO(nsid);
2605
2606static struct attribute *nvme_ns_id_attrs[] = {
2607        &dev_attr_wwid.attr,
2608        &dev_attr_uuid.attr,
2609        &dev_attr_nguid.attr,
2610        &dev_attr_eui.attr,
2611        &dev_attr_nsid.attr,
2612        NULL,
2613};
2614
2615static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
2616                struct attribute *a, int n)
2617{
2618        struct device *dev = container_of(kobj, struct device, kobj);
2619        struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
2620
2621        if (a == &dev_attr_uuid.attr) {
2622                if (uuid_is_null(&ids->uuid) &&
2623                    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2624                        return 0;
2625        }
2626        if (a == &dev_attr_nguid.attr) {
2627                if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2628                        return 0;
2629        }
2630        if (a == &dev_attr_eui.attr) {
2631                if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2632                        return 0;
2633        }
2634        return a->mode;
2635}
2636
2637const struct attribute_group nvme_ns_id_attr_group = {
2638        .attrs          = nvme_ns_id_attrs,
2639        .is_visible     = nvme_ns_id_attrs_are_visible,
2640};
2641
2642#define nvme_show_str_function(field)                                           \
2643static ssize_t  field##_show(struct device *dev,                                \
2644                            struct device_attribute *attr, char *buf)           \
2645{                                                                               \
2646        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
2647        return sprintf(buf, "%.*s\n",                                           \
2648                (int)sizeof(ctrl->subsys->field), ctrl->subsys->field);         \
2649}                                                                               \
2650static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2651
2652nvme_show_str_function(model);
2653nvme_show_str_function(serial);
2654nvme_show_str_function(firmware_rev);
2655
2656#define nvme_show_int_function(field)                                           \
2657static ssize_t  field##_show(struct device *dev,                                \
2658                            struct device_attribute *attr, char *buf)           \
2659{                                                                               \
2660        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
2661        return sprintf(buf, "%d\n", ctrl->field);       \
2662}                                                                               \
2663static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2664
2665nvme_show_int_function(cntlid);
2666
2667static ssize_t nvme_sysfs_delete(struct device *dev,
2668                                struct device_attribute *attr, const char *buf,
2669                                size_t count)
2670{
2671        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2672
2673        if (device_remove_file_self(dev, attr))
2674                nvme_delete_ctrl_sync(ctrl);
2675        return count;
2676}
2677static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
2678
2679static ssize_t nvme_sysfs_show_transport(struct device *dev,
2680                                         struct device_attribute *attr,
2681                                         char *buf)
2682{
2683        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2684
2685        return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
2686}
2687static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
2688
2689static ssize_t nvme_sysfs_show_state(struct device *dev,
2690                                     struct device_attribute *attr,
2691                                     char *buf)
2692{
2693        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2694        static const char *const state_name[] = {
2695                [NVME_CTRL_NEW]         = "new",
2696                [NVME_CTRL_LIVE]        = "live",
2697                [NVME_CTRL_ADMIN_ONLY]  = "only-admin",
2698                [NVME_CTRL_RESETTING]   = "resetting",
2699                [NVME_CTRL_CONNECTING]  = "connecting",
2700                [NVME_CTRL_DELETING]    = "deleting",
2701                [NVME_CTRL_DEAD]        = "dead",
2702        };
2703
2704        if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
2705            state_name[ctrl->state])
2706                return sprintf(buf, "%s\n", state_name[ctrl->state]);
2707
2708        return sprintf(buf, "unknown state\n");
2709}
2710
2711static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
2712
2713static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
2714                                         struct device_attribute *attr,
2715                                         char *buf)
2716{
2717        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2718
2719        return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
2720}
2721static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
2722
2723static ssize_t nvme_sysfs_show_address(struct device *dev,
2724                                         struct device_attribute *attr,
2725                                         char *buf)
2726{
2727        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2728
2729        return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
2730}
2731static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
2732
2733static struct attribute *nvme_dev_attrs[] = {
2734        &dev_attr_reset_controller.attr,
2735        &dev_attr_rescan_controller.attr,
2736        &dev_attr_model.attr,
2737        &dev_attr_serial.attr,
2738        &dev_attr_firmware_rev.attr,
2739        &dev_attr_cntlid.attr,
2740        &dev_attr_delete_controller.attr,
2741        &dev_attr_transport.attr,
2742        &dev_attr_subsysnqn.attr,
2743        &dev_attr_address.attr,
2744        &dev_attr_state.attr,
2745        NULL
2746};
2747
2748static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
2749                struct attribute *a, int n)
2750{
2751        struct device *dev = container_of(kobj, struct device, kobj);
2752        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2753
2754        if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
2755                return 0;
2756        if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
2757                return 0;
2758
2759        return a->mode;
2760}
2761
2762static struct attribute_group nvme_dev_attrs_group = {
2763        .attrs          = nvme_dev_attrs,
2764        .is_visible     = nvme_dev_attrs_are_visible,
2765};
2766
2767static const struct attribute_group *nvme_dev_attr_groups[] = {
2768        &nvme_dev_attrs_group,
2769        NULL,
2770};
2771
2772static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
2773                unsigned nsid)
2774{
2775        struct nvme_ns_head *h;
2776
2777        lockdep_assert_held(&subsys->lock);
2778
2779        list_for_each_entry(h, &subsys->nsheads, entry) {
2780                if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
2781                        return h;
2782        }
2783
2784        return NULL;
2785}
2786
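/*
 * Make sure the identifiers of a new namespace head do not collide with
 * those of any other namespace in the subsystem.  Caller must hold
 * subsys->lock.
 */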
2787static int __nvme_check_ids(struct nvme_subsystem *subsys,
2788                struct nvme_ns_head *new)
2789{
2790        struct nvme_ns_head *h;
2791
2792        lockdep_assert_held(&subsys->lock);
2793
2794        list_for_each_entry(h, &subsys->nsheads, entry) {
2795                if (nvme_ns_ids_valid(&new->ids) &&
2796                    nvme_ns_ids_equal(&new->ids, &h->ids))
2797                        return -EINVAL;
2798        }
2799
2800        return 0;
2801}
2802
2803static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
2804                unsigned nsid, struct nvme_id_ns *id)
2805{
2806        struct nvme_ns_head *head;
2807        int ret = -ENOMEM;
2808
2809        head = kzalloc(sizeof(*head), GFP_KERNEL);
2810        if (!head)
2811                goto out;
2812        ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
2813        if (ret < 0)
2814                goto out_free_head;
2815        head->instance = ret;
2816        INIT_LIST_HEAD(&head->list);
2817        init_srcu_struct(&head->srcu);
2818        head->subsys = ctrl->subsys;
2819        head->ns_id = nsid;
2820        kref_init(&head->ref);
2821
2822        nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
2823
2824        ret = __nvme_check_ids(ctrl->subsys, head);
2825        if (ret) {
2826                dev_err(ctrl->device,
2827                        "duplicate IDs for nsid %d\n", nsid);
2828                goto out_cleanup_srcu;
2829        }
2830
2831        ret = nvme_mpath_alloc_disk(ctrl, head);
2832        if (ret)
2833                goto out_cleanup_srcu;
2834
2835        list_add_tail(&head->entry, &ctrl->subsys->nsheads);
2836        return head;
2837out_cleanup_srcu:
2838        cleanup_srcu_struct(&head->srcu);
2839        ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
2840out_free_head:
2841        kfree(head);
2842out:
2843        return ERR_PTR(ret);
2844}
2845
2846static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
2847                struct nvme_id_ns *id)
2848{
2849        struct nvme_ctrl *ctrl = ns->ctrl;
2850        bool is_shared = id->nmic & (1 << 0);
2851        struct nvme_ns_head *head = NULL;
2852        int ret = 0;
2853
2854        mutex_lock(&ctrl->subsys->lock);
2855        if (is_shared)
2856                head = __nvme_find_ns_head(ctrl->subsys, nsid);
2857        if (!head) {
2858                head = nvme_alloc_ns_head(ctrl, nsid, id);
2859                if (IS_ERR(head)) {
2860                        ret = PTR_ERR(head);
2861                        goto out_unlock;
2862                }
2863        } else {
2864                struct nvme_ns_ids ids;
2865
2866                nvme_report_ns_ids(ctrl, nsid, id, &ids);
2867                if (!nvme_ns_ids_equal(&head->ids, &ids)) {
2868                        dev_err(ctrl->device,
2869                                "IDs don't match for shared namespace %d\n",
2870                                        nsid);
2871                        ret = -EINVAL;
2872                        goto out_unlock;
2873                }
2874        }
2875
2876        list_add_tail(&ns->siblings, &head->list);
2877        ns->head = head;
2878
2879out_unlock:
2880        mutex_unlock(&ctrl->subsys->lock);
2881        return ret;
2882}
2883
2884static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
2885{
2886        struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
2887        struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
2888
2889        return nsa->head->ns_id - nsb->head->ns_id;
2890}
2891
2892static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2893{
2894        struct nvme_ns *ns, *ret = NULL;
2895
2896        mutex_lock(&ctrl->namespaces_mutex);
2897        list_for_each_entry(ns, &ctrl->namespaces, list) {
2898                if (ns->head->ns_id == nsid) {
2899                        if (!kref_get_unless_zero(&ns->kref))
2900                                continue;
2901                        ret = ns;
2902                        break;
2903                }
2904                if (ns->head->ns_id > nsid)
2905                        break;
2906        }
2907        mutex_unlock(&ctrl->namespaces_mutex);
2908        return ret;
2909}
2910
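/*
 * If Streams directives are in use, fetch the per-namespace stream
 * parameters and use the stream write size and granularity to set the
 * queue's preferred I/O size hints.
 */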
2911static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
2912{
2913        struct streams_directive_params s;
2914        int ret;
2915
2916        if (!ctrl->nr_streams)
2917                return 0;
2918
2919        ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
2920        if (ret)
2921                return ret;
2922
2923        ns->sws = le32_to_cpu(s.sws);
2924        ns->sgs = le16_to_cpu(s.sgs);
2925
2926        if (ns->sws) {
2927                unsigned int bs = 1 << ns->lba_shift;
2928
2929                blk_queue_io_min(ns->queue, bs * ns->sws);
2930                if (ns->sgs)
2931                        blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
2932        }
2933
2934        return 0;
2935}
2936
2937static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2938{
2939        struct nvme_ns *ns;
2940        struct gendisk *disk;
2941        struct nvme_id_ns *id;
2942        char disk_name[DISK_NAME_LEN];
2943        int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
2944
2945        ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
2946        if (!ns)
2947                return;
2948
2949        ns->queue = blk_mq_init_queue(ctrl->tagset);
2950        if (IS_ERR(ns->queue))
2951                goto out_free_ns;
2952        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
2953        ns->queue->queuedata = ns;
2954        ns->ctrl = ctrl;
2955
2956        kref_init(&ns->kref);
2957        ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
2958
2959        blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
2960        nvme_set_queue_limits(ctrl, ns->queue);
2961
2962        id = nvme_identify_ns(ctrl, nsid);
2963        if (!id)
2964                goto out_free_queue;
2965
2966        if (id->ncap == 0)
2967                goto out_free_id;
2968
2969        if (nvme_init_ns_head(ns, nsid, id))
2970                goto out_free_id;
2971        nvme_setup_streams_ns(ctrl, ns);
2972
2973#ifdef CONFIG_NVME_MULTIPATH
2974        /*
2975         * If multipathing is enabled we need to always use the subsystem
2976         * instance number for numbering our devices to avoid conflicts
2977         * between subsystems that have multiple controllers and thus use
2978         * the multipath-aware subsystem node and those that have a single
2979         * controller and use the controller node directly.
2980         */
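        /* e.g. a hidden per-path node "nvme0c1n1" next to the shared "nvme0n1" */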
2981        if (ns->head->disk) {
2982                sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
2983                                ctrl->cntlid, ns->head->instance);
2984                flags = GENHD_FL_HIDDEN;
2985        } else {
2986                sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
2987                                ns->head->instance);
2988        }
2989#else
2990        /*
2991         * But without the multipath code enabled, multiple controllers per
2992         * subsystem are visible as devices and thus we cannot use the
2993         * subsystem instance.
2994         */
2995        sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
2996#endif
2997
2998        if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
2999                if (nvme_nvm_register(ns, disk_name, node)) {
3000                        dev_warn(ctrl->device, "LightNVM init failure\n");
3001                        goto out_unlink_ns;
3002                }
3003        }
3004
3005        disk = alloc_disk_node(0, node);
3006        if (!disk)
3007                goto out_unlink_ns;
3008
3009        disk->fops = &nvme_fops;
3010        disk->private_data = ns;
3011        disk->queue = ns->queue;
3012        disk->flags = flags;
3013        memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
3014        ns->disk = disk;
3015
3016        __nvme_revalidate_disk(disk, id);
3017
3018        mutex_lock(&ctrl->namespaces_mutex);
3019        list_add_tail(&ns->list, &ctrl->namespaces);
3020        mutex_unlock(&ctrl->namespaces_mutex);
3021
3022        nvme_get_ctrl(ctrl);
3023
3024        kfree(id);
3025
3026        device_add_disk(ctrl->device, ns->disk);
3027        if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
3028                                        &nvme_ns_id_attr_group))
3029                pr_warn("%s: failed to create sysfs group for identification\n",
3030                        ns->disk->disk_name);
3031        if (ns->ndev && nvme_nvm_register_sysfs(ns))
3032                pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
3033                        ns->disk->disk_name);
3034
3035        nvme_mpath_add_disk(ns->head);
3036        return;
3037 out_unlink_ns:
3038        mutex_lock(&ctrl->subsys->lock);
3039        list_del_rcu(&ns->siblings);
3040        mutex_unlock(&ctrl->subsys->lock);
3041 out_free_id:
3042        kfree(id);
3043 out_free_queue:
3044        blk_cleanup_queue(ns->queue);
3045 out_free_ns:
3046        kfree(ns);
3047}
3048
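/*
 * Tear down a namespace: unpublish its gendisk and sysfs groups (if they
 * were registered), unlink it from the multipath head and the controller's
 * namespace list, then wait for SRCU readers to finish before dropping the
 * final reference.
 */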
3049static void nvme_ns_remove(struct nvme_ns *ns)
3050{
3051        if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
3052                return;
3053
3054        if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
3055                sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
3056                                        &nvme_ns_id_attr_group);
3057                if (ns->ndev)
3058                        nvme_nvm_unregister_sysfs(ns);
3059                del_gendisk(ns->disk);
3060                blk_cleanup_queue(ns->queue);
3061                if (blk_get_integrity(ns->disk))
3062                        blk_integrity_unregister(ns->disk);
3063        }
3064
3065        mutex_lock(&ns->ctrl->subsys->lock);
3066        nvme_mpath_clear_current_path(ns);
3067        list_del_rcu(&ns->siblings);
3068        mutex_unlock(&ns->ctrl->subsys->lock);
3069
3070        mutex_lock(&ns->ctrl->namespaces_mutex);
3071        list_del_init(&ns->list);
3072        mutex_unlock(&ns->ctrl->namespaces_mutex);
3073
3074        synchronize_srcu(&ns->head->srcu);
3075        nvme_mpath_check_last_path(ns);
3076        nvme_put_ns(ns);
3077}
3078
3079static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3080{
3081        struct nvme_ns *ns;
3082
3083        ns = nvme_find_get_ns(ctrl, nsid);
3084        if (ns) {
3085                if (ns->disk && revalidate_disk(ns->disk))
3086                        nvme_ns_remove(ns);
3087                nvme_put_ns(ns);
3088        } else
3089                nvme_alloc_ns(ctrl, nsid);
3090}
3091
3092static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
3093                                        unsigned nsid)
3094{
3095        struct nvme_ns *ns, *next;
3096
3097        list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
3098                if (ns->head->ns_id > nsid)
3099                        nvme_ns_remove(ns);
3100        }
3101}
3102
3103static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
3104{
3105        struct nvme_ns *ns;
3106        __le32 *ns_list;
3107        unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
3108        int ret = 0;
3109
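        /*
         * Descriptive note (added): an Identify Namespace List data buffer is
         * 4KB (0x1000 bytes) and holds up to 1024 little-endian 32-bit NSIDs,
         * which is why the namespace count is processed in chunks of 1024.
         */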
3110        ns_list = kzalloc(0x1000, GFP_KERNEL);
3111        if (!ns_list)
3112                return -ENOMEM;
3113
3114        for (i = 0; i < num_lists; i++) {
3115                ret = nvme_identify_ns_list(ctrl, prev, ns_list);
3116                if (ret)
3117                        goto free;
3118
3119                for (j = 0; j < min(nn, 1024U); j++) {
3120                        nsid = le32_to_cpu(ns_list[j]);
3121                        if (!nsid)
3122                                goto out;
3123
3124                        nvme_validate_ns(ctrl, nsid);
3125
3126                        while (++prev < nsid) {
3127                                ns = nvme_find_get_ns(ctrl, prev);
3128                                if (ns) {
3129                                        nvme_ns_remove(ns);
3130                                        nvme_put_ns(ns);
3131                                }
3132                        }
3133                }
3134                nn -= j;
3135        }
3136 out:
3137        nvme_remove_invalid_namespaces(ctrl, prev);
3138 free:
3139        kfree(ns_list);
3140        return ret;
3141}
3142
3143static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
3144{
3145        unsigned i;
3146
3147        for (i = 1; i <= nn; i++)
3148                nvme_validate_ns(ctrl, i);
3149
3150        nvme_remove_invalid_namespaces(ctrl, nn);
3151}
3152
3153static void nvme_scan_work(struct work_struct *work)
3154{
3155        struct nvme_ctrl *ctrl =
3156                container_of(work, struct nvme_ctrl, scan_work);
3157        struct nvme_id_ctrl *id;
3158        unsigned nn;
3159
3160        if (ctrl->state != NVME_CTRL_LIVE)
3161                return;
3162
3163        WARN_ON_ONCE(!ctrl->tagset);
3164
3165        if (nvme_identify_ctrl(ctrl, &id))
3166                return;
3167
3168        nn = le32_to_cpu(id->nn);
3169        if (ctrl->vs >= NVME_VS(1, 1, 0) &&
3170            !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
3171                if (!nvme_scan_ns_list(ctrl, nn))
3172                        goto done;
3173        }
3174        nvme_scan_ns_sequential(ctrl, nn);
3175 done:
3176        mutex_lock(&ctrl->namespaces_mutex);
3177        list_sort(NULL, &ctrl->namespaces, ns_cmp);
3178        mutex_unlock(&ctrl->namespaces_mutex);
3179        kfree(id);
3180}
3181
3182void nvme_queue_scan(struct nvme_ctrl *ctrl)
3183{
3184        /*
3185         * Only queue new scan work when admin and IO queues are both alive
3186         */
3187        if (ctrl->state == NVME_CTRL_LIVE)
3188                queue_work(nvme_wq, &ctrl->scan_work);
3189}
3190EXPORT_SYMBOL_GPL(nvme_queue_scan);
3191
3192/*
3193 * This function iterates the namespace list unlocked to allow recovery from
3194 * controller failure. It is up to the caller to ensure the namespace list is
3195 * not modified by scan work while this function is executing.
3196 */
3197void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
3198{
3199        struct nvme_ns *ns, *next;
3200
3201        /*
3202         * The dead state indicates that the controller was not gracefully
3203         * disconnected. In that case, we won't be able to flush any data while
3204         * removing the namespaces' disks; fail all the queues now to avoid
3205         * potentially having to clean up the failed sync later.
3206         */
3207        if (ctrl->state == NVME_CTRL_DEAD)
3208                nvme_kill_queues(ctrl);
3209
3210        list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
3211                nvme_ns_remove(ns);
3212}
3213EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
3214
3215static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
3216{
3217        char *envp[2] = { NULL, NULL };
3218        u32 aen_result = ctrl->aen_result;
3219
3220        ctrl->aen_result = 0;
3221        if (!aen_result)
3222                return;
3223
3224        envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
3225        if (!envp[0])
3226                return;
3227        kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
3228        kfree(envp[0]);
3229}
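
/*
 * Usage sketch (illustrative, not part of the driver): userspace can react to
 * the change uevent emitted above by matching on the NVME_AEN environment
 * variable, e.g. with a udev rule such as:
 *
 *   ACTION=="change", SUBSYSTEM=="nvme", ENV{NVME_AEN}=="0x*", \
 *       RUN+="/usr/local/sbin/nvme-aen-handler %k $env{NVME_AEN}"
 *
 * The handler path is hypothetical; only the NVME_AEN=%#08x format string
 * comes from nvme_aen_uevent() itself.
 */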
3230
3231static void nvme_async_event_work(struct work_struct *work)
3232{
3233        struct nvme_ctrl *ctrl =
3234                container_of(work, struct nvme_ctrl, async_event_work);
3235
3236        nvme_aen_uevent(ctrl);
3237        ctrl->ops->submit_async_event(ctrl);
3238}
3239
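/*
 * Descriptive note (added): CSTS.PP ("Processing Paused") is set by the
 * controller while it pauses command processing to activate new firmware.
 * The helper below therefore returns true only while the controller is
 * enabled (cached CC.EN) and such an activation is still in progress.
 */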
3240static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
3241{
3242
3243        u32 csts;
3244
3245        if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
3246                return false;
3247
3248        if (csts == ~0)
3249                return false;
3250
3251        return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
3252}
3253
3254static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
3255{
3256        struct nvme_fw_slot_info_log *log;
3257
3258        log = kmalloc(sizeof(*log), GFP_KERNEL);
3259        if (!log)
3260                return;
3261
3262        if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
3263                dev_warn(ctrl->device,
3264                                "Get FW SLOT INFO log error\n");
3265        kfree(log);
3266}
3267
3268static void nvme_fw_act_work(struct work_struct *work)
3269{
3270        struct nvme_ctrl *ctrl = container_of(work,
3271                                struct nvme_ctrl, fw_act_work);
3272        unsigned long fw_act_timeout;
3273
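        /*
         * Descriptive note (added): MTFA (Maximum Time for Firmware
         * Activation) is reported by the controller in units of 100 ms;
         * when it is zero, fall back to the admin command timeout.
         */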
3274        if (ctrl->mtfa)
3275                fw_act_timeout = jiffies +
3276                                msecs_to_jiffies(ctrl->mtfa * 100);
3277        else
3278                fw_act_timeout = jiffies +
3279                                msecs_to_jiffies(admin_timeout * 1000);
3280
3281        nvme_stop_queues(ctrl);
3282        while (nvme_ctrl_pp_status(ctrl)) {
3283                if (time_after(jiffies, fw_act_timeout)) {
3284                        dev_warn(ctrl->device,
3285                                "Fw activation timeout, reset controller\n");
3286                        nvme_reset_ctrl(ctrl);
3287                        break;
3288                }
3289                msleep(100);
3290        }
3291
3292        if (ctrl->state != NVME_CTRL_LIVE)
3293                return;
3294
3295        nvme_start_queues(ctrl);
3296        /* read FW slot information to clear the AER */
3297        nvme_get_fw_slot_info(ctrl);
3298}
3299
3300void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
3301                union nvme_result *res)
3302{
3303        u32 result = le32_to_cpu(res->u32);
3304
3305        if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
3306                return;
3307
3308        switch (result & 0x7) {
3309        case NVME_AER_ERROR:
3310        case NVME_AER_SMART:
3311        case NVME_AER_CSS:
3312        case NVME_AER_VS:
3313                ctrl->aen_result = result;
3314                break;
3315        default:
3316                break;
3317        }
3318
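        /*
         * Descriptive note (added): in the AER completion result, bits 2:0
         * carry the event type and bits 15:8 the event information, so the
         * 0xff07 mask matches notice events by both type and info code.
         */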
3319        switch (result & 0xff07) {
3320        case NVME_AER_NOTICE_NS_CHANGED:
3321                dev_info(ctrl->device, "rescanning\n");
3322                nvme_queue_scan(ctrl);
3323                break;
3324        case NVME_AER_NOTICE_FW_ACT_STARTING:
3325                queue_work(nvme_wq, &ctrl->fw_act_work);
3326                break;
3327        default:
3328                dev_warn(ctrl->device, "async event result %08x\n", result);
3329        }
3330        queue_work(nvme_wq, &ctrl->async_event_work);
3331}
3332EXPORT_SYMBOL_GPL(nvme_complete_async_event);
3333
3334void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
3335{
3336        nvme_stop_keep_alive(ctrl);
3337        flush_work(&ctrl->async_event_work);
3338        flush_work(&ctrl->scan_work);
3339        cancel_work_sync(&ctrl->fw_act_work);
3340}
3341EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
3342
3343void nvme_start_ctrl(struct nvme_ctrl *ctrl)
3344{
3345        if (ctrl->kato)
3346                nvme_start_keep_alive(ctrl);
3347
3348        if (ctrl->queue_count > 1) {
3349                nvme_queue_scan(ctrl);
3350                queue_work(nvme_wq, &ctrl->async_event_work);
3351                nvme_start_queues(ctrl);
3352        }
3353}
3354EXPORT_SYMBOL_GPL(nvme_start_ctrl);
3355
3356void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
3357{
3358        cdev_device_del(&ctrl->cdev, ctrl->device);
3359}
3360EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
3361
3362static void nvme_free_ctrl(struct device *dev)
3363{
3364        struct nvme_ctrl *ctrl =
3365                container_of(dev, struct nvme_ctrl, ctrl_device);
3366        struct nvme_subsystem *subsys = ctrl->subsys;
3367
3368        ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3369        kfree(ctrl->effects);
3370
3371        if (subsys) {
3372                mutex_lock(&subsys->lock);
3373                list_del(&ctrl->subsys_entry);
3374                mutex_unlock(&subsys->lock);
3375                sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
3376        }
3377
3378        ctrl->ops->free_ctrl(ctrl);
3379
3380        if (subsys)
3381                nvme_put_subsystem(subsys);
3382}
3383
3384/*
3385 * Initialize an NVMe controller structure.  This needs to be called during the
3386 * earliest initialization so that we have the initialized structure around
3387 * during probing.
3388 */
3389int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
3390                const struct nvme_ctrl_ops *ops, unsigned long quirks)
3391{
3392        int ret;
3393
3394        ctrl->state = NVME_CTRL_NEW;
3395        spin_lock_init(&ctrl->lock);
3396        INIT_LIST_HEAD(&ctrl->namespaces);
3397        mutex_init(&ctrl->namespaces_mutex);
3398        ctrl->dev = dev;
3399        ctrl->ops = ops;
3400        ctrl->quirks = quirks;
3401        INIT_WORK(&ctrl->scan_work, nvme_scan_work);
3402        INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
3403        INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
3404        INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
3405
3406        ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
3407        if (ret < 0)
3408                goto out;
3409        ctrl->instance = ret;
3410
3411        device_initialize(&ctrl->ctrl_device);
3412        ctrl->device = &ctrl->ctrl_device;
3413        ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
3414        ctrl->device->class = nvme_class;
3415        ctrl->device->parent = ctrl->dev;
3416        ctrl->device->groups = nvme_dev_attr_groups;
3417        ctrl->device->release = nvme_free_ctrl;
3418        dev_set_drvdata(ctrl->device, ctrl);
3419        ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
3420        if (ret)
3421                goto out_release_instance;
3422
3423        cdev_init(&ctrl->cdev, &nvme_dev_fops);
3424        ctrl->cdev.owner = ops->module;
3425        ret = cdev_device_add(&ctrl->cdev, ctrl->device);
3426        if (ret)
3427                goto out_free_name;
3428
3429        /*
3430         * Initialize latency tolerance controls.  The sysfs files won't
3431         * be visible to userspace unless the device actually supports APST.
3432         */
3433        ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
3434        dev_pm_qos_update_user_latency_tolerance(ctrl->device,
3435                min(default_ps_max_latency_us, (unsigned long)S32_MAX));
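        /*
         * Usage note (added): after registration the tolerance can be
         * adjusted per device through the standard PM QoS sysfs attribute,
         * for example (device name is illustrative):
         *
         *   echo 25000 > /sys/class/nvme/nvme0/power/pm_qos_latency_tolerance_us
         */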
3436
3437        return 0;
3438out_free_name:
3439        kfree_const(dev->kobj.name);
3440out_release_instance:
3441        ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3442out:
3443        return ret;
3444}
3445EXPORT_SYMBOL_GPL(nvme_init_ctrl);
3446
3447/**
3448 * nvme_kill_queues() - Ends all namespace queues
3449 * @ctrl: the dead controller whose namespace queues need to be ended
3450 *
3451 * Call this function when the driver determines it is unable to get the
3452 * controller in a state capable of servicing IO.
3453 */
3454void nvme_kill_queues(struct nvme_ctrl *ctrl)
3455{
3456        struct nvme_ns *ns;
3457
3458        mutex_lock(&ctrl->namespaces_mutex);
3459
3460        /* Forcibly unquiesce queues to avoid blocking dispatch */
3461        if (ctrl->admin_q)
3462                blk_mq_unquiesce_queue(ctrl->admin_q);
3463
3464        list_for_each_entry(ns, &ctrl->namespaces, list) {
3465                /*
3466                 * Revalidating a dead namespace sets its capacity to 0. This
3467                 * prevents buffered writers from dirtying pages that can't be synced.
3468                 */
3469                if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
3470                        continue;
3471                revalidate_disk(ns->disk);
3472                blk_set_queue_dying(ns->queue);
3473
3474                /* Forcibly unquiesce queues to avoid blocking dispatch */
3475                blk_mq_unquiesce_queue(ns->queue);
3476        }
3477        mutex_unlock(&ctrl->namespaces_mutex);
3478}
3479EXPORT_SYMBOL_GPL(nvme_kill_queues);
3480
3481void nvme_unfreeze(struct nvme_ctrl *ctrl)
3482{
3483        struct nvme_ns *ns;
3484
3485        mutex_lock(&ctrl->namespaces_mutex);
3486        list_for_each_entry(ns, &ctrl->namespaces, list)
3487                blk_mq_unfreeze_queue(ns->queue);
3488        mutex_unlock(&ctrl->namespaces_mutex);
3489}
3490EXPORT_SYMBOL_GPL(nvme_unfreeze);
3491
3492void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
3493{
3494        struct nvme_ns *ns;
3495
3496        mutex_lock(&ctrl->namespaces_mutex);
3497        list_for_each_entry(ns, &ctrl->namespaces, list) {
3498                timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
3499                if (timeout <= 0)
3500                        break;
3501        }
3502        mutex_unlock(&ctrl->namespaces_mutex);
3503}
3504EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
3505
3506void nvme_wait_freeze(struct nvme_ctrl *ctrl)
3507{
3508        struct nvme_ns *ns;
3509
3510        mutex_lock(&ctrl->namespaces_mutex);
3511        list_for_each_entry(ns, &ctrl->namespaces, list)
3512                blk_mq_freeze_queue_wait(ns->queue);
3513        mutex_unlock(&ctrl->namespaces_mutex);
3514}
3515EXPORT_SYMBOL_GPL(nvme_wait_freeze);
3516
3517void nvme_start_freeze(struct nvme_ctrl *ctrl)
3518{
3519        struct nvme_ns *ns;
3520
3521        mutex_lock(&ctrl->namespaces_mutex);
3522        list_for_each_entry(ns, &ctrl->namespaces, list)
3523                blk_freeze_queue_start(ns->queue);
3524        mutex_unlock(&ctrl->namespaces_mutex);
3525}
3526EXPORT_SYMBOL_GPL(nvme_start_freeze);
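
/*
 * Usage sketch (illustrative, not taken from a specific transport): the
 * freeze helpers above are typically paired across a controller reset,
 * roughly as follows:
 *
 *   nvme_start_freeze(ctrl);                  // block new I/O submissions
 *   nvme_wait_freeze_timeout(ctrl, timeout);  // drain in-flight requests
 *   ... tear down and re-create the I/O queues ...
 *   nvme_unfreeze(ctrl);                      // resume request processing
 *
 * The exact ordering and the timeout value are transport specific.
 */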
3527
3528void nvme_stop_queues(struct nvme_ctrl *ctrl)
3529{
3530        struct nvme_ns *ns;
3531
3532        mutex_lock(&ctrl->namespaces_mutex);
3533        list_for_each_entry(ns, &ctrl->namespaces, list)
3534                blk_mq_quiesce_queue(ns->queue);
3535        mutex_unlock(&ctrl->namespaces_mutex);
3536}
3537EXPORT_SYMBOL_GPL(nvme_stop_queues);
3538
3539void nvme_start_queues(struct nvme_ctrl *ctrl)
3540{
3541        struct nvme_ns *ns;
3542
3543        mutex_lock(&ctrl->namespaces_mutex);
3544        list_for_each_entry(ns, &ctrl->namespaces, list)
3545                blk_mq_unquiesce_queue(ns->queue);
3546        mutex_unlock(&ctrl->namespaces_mutex);
3547}
3548EXPORT_SYMBOL_GPL(nvme_start_queues);
3549
3550int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
3551{
3552        if (!ctrl->ops->reinit_request)
3553                return 0;
3554
3555        return blk_mq_tagset_iter(set, set->driver_data,
3556                        ctrl->ops->reinit_request);
3557}
3558EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
3559
3560int __init nvme_core_init(void)
3561{
3562        int result = -ENOMEM;
3563
3564        nvme_wq = alloc_workqueue("nvme-wq",
3565                        WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3566        if (!nvme_wq)
3567                goto out;
3568
3569        nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
3570                        WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3571        if (!nvme_reset_wq)
3572                goto destroy_wq;
3573
3574        nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
3575                        WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3576        if (!nvme_delete_wq)
3577                goto destroy_reset_wq;
3578
3579        result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
3580        if (result < 0)
3581                goto destroy_delete_wq;
3582
3583        nvme_class = class_create(THIS_MODULE, "nvme");
3584        if (IS_ERR(nvme_class)) {
3585                result = PTR_ERR(nvme_class);
3586                goto unregister_chrdev;
3587        }
3588
3589        nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
3590        if (IS_ERR(nvme_subsys_class)) {
3591                result = PTR_ERR(nvme_subsys_class);
3592                goto destroy_class;
3593        }
3594        return 0;
3595
3596destroy_class:
3597        class_destroy(nvme_class);
3598unregister_chrdev:
3599        unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3600destroy_delete_wq:
3601        destroy_workqueue(nvme_delete_wq);
3602destroy_reset_wq:
3603        destroy_workqueue(nvme_reset_wq);
3604destroy_wq:
3605        destroy_workqueue(nvme_wq);
3606out:
3607        return result;
3608}
3609
3610void nvme_core_exit(void)
3611{
3612        ida_destroy(&nvme_subsystems_ida);
3613        class_destroy(nvme_subsys_class);
3614        class_destroy(nvme_class);
3615        unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3616        destroy_workqueue(nvme_delete_wq);
3617        destroy_workqueue(nvme_reset_wq);
3618        destroy_workqueue(nvme_wq);
3619}
3620
3621MODULE_LICENSE("GPL");
3622MODULE_VERSION("1.0");
3623module_init(nvme_core_init);
3624module_exit(nvme_core_exit);
3625