uboot/drivers/nvme/nvme.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0+
   2/*
   3 * Copyright (C) 2017 NXP Semiconductors
   4 * Copyright (C) 2017 Bin Meng <bmeng.cn@gmail.com>
   5 */
   6
   7#include <common.h>
   8#include <blk.h>
   9#include <cpu_func.h>
  10#include <dm.h>
  11#include <errno.h>
  12#include <log.h>
  13#include <malloc.h>
  14#include <memalign.h>
  15#include <pci.h>
  16#include <time.h>
  17#include <dm/device-internal.h>
  18#include <linux/compat.h>
  19#include "nvme.h"
  20
  /* Number of entries in each I/O queue and in the admin queue */
  #define NVME_Q_DEPTH            2
  #define NVME_AQ_DEPTH           2
  /*
   * Size in bytes of a submission/completion queue with @depth entries.
   * The argument is parenthesized so that expressions such as
   * NVME_SQ_SIZE(a + b) expand correctly.
   */
  #define NVME_SQ_SIZE(depth)     ((depth) * sizeof(struct nvme_command))
  #define NVME_CQ_SIZE(depth)     ((depth) * sizeof(struct nvme_completion))
  /* Completion queue allocation, rounded up to whole DMA cache lines */
  #define NVME_CQ_ALLOCATION      ALIGN(NVME_CQ_SIZE(NVME_Q_DEPTH), \
                                        ARCH_DMA_MINALIGN)
  /*
   * Command timeouts as handed to nvme_submit_sync_cmd(), which multiplies
   * them by 100000 to obtain microseconds.
   */
  #define ADMIN_TIMEOUT           60
  #define IO_TIMEOUT              30
  /* Size in bytes of the PRP pool allocated at probe time */
  #define MAX_PRP_POOL            512
  30
  /* Indices into the per-device array of queue pointers */
  enum nvme_queue_id {
          NVME_ADMIN_Q = 0,       /* queue used for admin commands */
          NVME_IO_Q = 1,          /* queue used for read/write commands */
          NVME_Q_NUM = 2,         /* number of queue slots, not a queue */
  };
  36
  37/*
  38 * An NVM Express queue. Each device has at least two (one for admin
  39 * commands and one for I/O commands).
  40 */
  41struct nvme_queue {
  42        struct nvme_dev *dev;
  43        struct nvme_command *sq_cmds;
  44        struct nvme_completion *cqes;
  45        wait_queue_head_t sq_full;
  46        u32 __iomem *q_db;
  47        u16 q_depth;
  48        s16 cq_vector;
  49        u16 sq_head;
  50        u16 sq_tail;
  51        u16 cq_head;
  52        u16 qid;
  53        u8 cq_phase;
  54        u8 cqe_seen;
  55        unsigned long cmdid_data[];
  56};
  57
  58static int nvme_wait_ready(struct nvme_dev *dev, bool enabled)
  59{
  60        u32 bit = enabled ? NVME_CSTS_RDY : 0;
  61        int timeout;
  62        ulong start;
  63
  64        /* Timeout field in the CAP register is in 500 millisecond units */
  65        timeout = NVME_CAP_TIMEOUT(dev->cap) * 500;
  66
  67        start = get_timer(0);
  68        while (get_timer(start) < timeout) {
  69                if ((readl(&dev->bar->csts) & NVME_CSTS_RDY) == bit)
  70                        return 0;
  71        }
  72
  73        return -ETIME;
  74}
  75
  76static int nvme_setup_prps(struct nvme_dev *dev, u64 *prp2,
  77                           int total_len, u64 dma_addr)
  78{
  79        u32 page_size = dev->page_size;
  80        int offset = dma_addr & (page_size - 1);
  81        u64 *prp_pool;
  82        int length = total_len;
  83        int i, nprps;
  84        u32 prps_per_page = page_size >> 3;
  85        u32 num_pages;
  86
  87        length -= (page_size - offset);
  88
  89        if (length <= 0) {
  90                *prp2 = 0;
  91                return 0;
  92        }
  93
  94        if (length)
  95                dma_addr += (page_size - offset);
  96
  97        if (length <= page_size) {
  98                *prp2 = dma_addr;
  99                return 0;
 100        }
 101
 102        nprps = DIV_ROUND_UP(length, page_size);
 103        num_pages = DIV_ROUND_UP(nprps, prps_per_page);
 104
 105        if (nprps > dev->prp_entry_num) {
 106                free(dev->prp_pool);
 107                /*
 108                 * Always increase in increments of pages.  It doesn't waste
 109                 * much memory and reduces the number of allocations.
 110                 */
 111                dev->prp_pool = memalign(page_size, num_pages * page_size);
 112                if (!dev->prp_pool) {
 113                        printf("Error: malloc prp_pool fail\n");
 114                        return -ENOMEM;
 115                }
 116                dev->prp_entry_num = prps_per_page * num_pages;
 117        }
 118
 119        prp_pool = dev->prp_pool;
 120        i = 0;
 121        while (nprps) {
 122                if (i == ((page_size >> 3) - 1)) {
 123                        *(prp_pool + i) = cpu_to_le64((ulong)prp_pool +
 124                                        page_size);
 125                        i = 0;
 126                        prp_pool += page_size;
 127                }
 128                *(prp_pool + i++) = cpu_to_le64(dma_addr);
 129                dma_addr += page_size;
 130                nprps--;
 131        }
 132        *prp2 = (ulong)dev->prp_pool;
 133
 134        flush_dcache_range((ulong)dev->prp_pool, (ulong)dev->prp_pool +
 135                           dev->prp_entry_num * sizeof(u64));
 136
 137        return 0;
 138}
 139
 140static __le16 nvme_get_cmd_id(void)
 141{
 142        static unsigned short cmdid;
 143
 144        return cpu_to_le16((cmdid < USHRT_MAX) ? cmdid++ : 0);
 145}
 146
 147static u16 nvme_read_completion_status(struct nvme_queue *nvmeq, u16 index)
 148{
 149        /*
 150         * Single CQ entries are always smaller than a cache line, so we
 151         * can't invalidate them individually. However CQ entries are
 152         * read only by the CPU, so it's safe to always invalidate all of them,
 153         * as the cache line should never become dirty.
 154         */
 155        ulong start = (ulong)&nvmeq->cqes[0];
 156        ulong stop = start + NVME_CQ_ALLOCATION;
 157
 158        invalidate_dcache_range(start, stop);
 159
 160        return readw(&(nvmeq->cqes[index].status));
 161}
 162
 163/**
 164 * nvme_submit_cmd() - copy a command into a queue and ring the doorbell
 165 *
 166 * @nvmeq:      The queue to use
 167 * @cmd:        The command to send
 168 */
 169static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
 170{
 171        u16 tail = nvmeq->sq_tail;
 172
 173        memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
 174        flush_dcache_range((ulong)&nvmeq->sq_cmds[tail],
 175                           (ulong)&nvmeq->sq_cmds[tail] + sizeof(*cmd));
 176
 177        if (++tail == nvmeq->q_depth)
 178                tail = 0;
 179        writel(tail, nvmeq->q_db);
 180        nvmeq->sq_tail = tail;
 181}
 182
 183static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq,
 184                                struct nvme_command *cmd,
 185                                u32 *result, unsigned timeout)
 186{
 187        u16 head = nvmeq->cq_head;
 188        u16 phase = nvmeq->cq_phase;
 189        u16 status;
 190        ulong start_time;
 191        ulong timeout_us = timeout * 100000;
 192
 193        cmd->common.command_id = nvme_get_cmd_id();
 194        nvme_submit_cmd(nvmeq, cmd);
 195
 196        start_time = timer_get_us();
 197
 198        for (;;) {
 199                status = nvme_read_completion_status(nvmeq, head);
 200                if ((status & 0x01) == phase)
 201                        break;
 202                if (timeout_us > 0 && (timer_get_us() - start_time)
 203                    >= timeout_us)
 204                        return -ETIMEDOUT;
 205        }
 206
 207        status >>= 1;
 208        if (status) {
 209                printf("ERROR: status = %x, phase = %d, head = %d\n",
 210                       status, phase, head);
 211                status = 0;
 212                if (++head == nvmeq->q_depth) {
 213                        head = 0;
 214                        phase = !phase;
 215                }
 216                writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
 217                nvmeq->cq_head = head;
 218                nvmeq->cq_phase = phase;
 219
 220                return -EIO;
 221        }
 222
 223        if (result)
 224                *result = readl(&(nvmeq->cqes[head].result));
 225
 226        if (++head == nvmeq->q_depth) {
 227                head = 0;
 228                phase = !phase;
 229        }
 230        writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
 231        nvmeq->cq_head = head;
 232        nvmeq->cq_phase = phase;
 233
 234        return status;
 235}
 236
 237static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
 238                                 u32 *result)
 239{
 240        return nvme_submit_sync_cmd(dev->queues[NVME_ADMIN_Q], cmd,
 241                                    result, ADMIN_TIMEOUT);
 242}
 243
 244static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev,
 245                                           int qid, int depth)
 246{
 247        struct nvme_queue *nvmeq = malloc(sizeof(*nvmeq));
 248        if (!nvmeq)
 249                return NULL;
 250        memset(nvmeq, 0, sizeof(*nvmeq));
 251
 252        nvmeq->cqes = (void *)memalign(4096, NVME_CQ_ALLOCATION);
 253        if (!nvmeq->cqes)
 254                goto free_nvmeq;
 255        memset((void *)nvmeq->cqes, 0, NVME_CQ_SIZE(depth));
 256
 257        nvmeq->sq_cmds = (void *)memalign(4096, NVME_SQ_SIZE(depth));
 258        if (!nvmeq->sq_cmds)
 259                goto free_queue;
 260        memset((void *)nvmeq->sq_cmds, 0, NVME_SQ_SIZE(depth));
 261
 262        nvmeq->dev = dev;
 263
 264        nvmeq->cq_head = 0;
 265        nvmeq->cq_phase = 1;
 266        nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 267        nvmeq->q_depth = depth;
 268        nvmeq->qid = qid;
 269        dev->queue_count++;
 270        dev->queues[qid] = nvmeq;
 271
 272        return nvmeq;
 273
 274 free_queue:
 275        free((void *)nvmeq->cqes);
 276 free_nvmeq:
 277        free(nvmeq);
 278
 279        return NULL;
 280}
 281
 282static int nvme_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 283{
 284        struct nvme_command c;
 285
 286        memset(&c, 0, sizeof(c));
 287        c.delete_queue.opcode = opcode;
 288        c.delete_queue.qid = cpu_to_le16(id);
 289
 290        return nvme_submit_admin_cmd(dev, &c, NULL);
 291}
 292
 293static int nvme_delete_sq(struct nvme_dev *dev, u16 sqid)
 294{
 295        return nvme_delete_queue(dev, nvme_admin_delete_sq, sqid);
 296}
 297
 298static int nvme_delete_cq(struct nvme_dev *dev, u16 cqid)
 299{
 300        return nvme_delete_queue(dev, nvme_admin_delete_cq, cqid);
 301}
 302
 303static int nvme_enable_ctrl(struct nvme_dev *dev)
 304{
 305        dev->ctrl_config &= ~NVME_CC_SHN_MASK;
 306        dev->ctrl_config |= NVME_CC_ENABLE;
 307        writel(dev->ctrl_config, &dev->bar->cc);
 308
 309        return nvme_wait_ready(dev, true);
 310}
 311
 312static int nvme_disable_ctrl(struct nvme_dev *dev)
 313{
 314        dev->ctrl_config &= ~NVME_CC_SHN_MASK;
 315        dev->ctrl_config &= ~NVME_CC_ENABLE;
 316        writel(dev->ctrl_config, &dev->bar->cc);
 317
 318        return nvme_wait_ready(dev, false);
 319}
 320
 321static void nvme_free_queue(struct nvme_queue *nvmeq)
 322{
 323        free((void *)nvmeq->cqes);
 324        free(nvmeq->sq_cmds);
 325        free(nvmeq);
 326}
 327
 328static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 329{
 330        int i;
 331
 332        for (i = dev->queue_count - 1; i >= lowest; i--) {
 333                struct nvme_queue *nvmeq = dev->queues[i];
 334                dev->queue_count--;
 335                dev->queues[i] = NULL;
 336                nvme_free_queue(nvmeq);
 337        }
 338}
 339
 340static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 341{
 342        struct nvme_dev *dev = nvmeq->dev;
 343
 344        nvmeq->sq_tail = 0;
 345        nvmeq->cq_head = 0;
 346        nvmeq->cq_phase = 1;
 347        nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 348        memset((void *)nvmeq->cqes, 0, NVME_CQ_SIZE(nvmeq->q_depth));
 349        flush_dcache_range((ulong)nvmeq->cqes,
 350                           (ulong)nvmeq->cqes + NVME_CQ_ALLOCATION);
 351        dev->online_queues++;
 352}
 353
 354static int nvme_configure_admin_queue(struct nvme_dev *dev)
 355{
 356        int result;
 357        u32 aqa;
 358        u64 cap = dev->cap;
 359        struct nvme_queue *nvmeq;
 360        /* most architectures use 4KB as the page size */
 361        unsigned page_shift = 12;
 362        unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
 363        unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;
 364
 365        if (page_shift < dev_page_min) {
 366                debug("Device minimum page size (%u) too large for host (%u)\n",
 367                      1 << dev_page_min, 1 << page_shift);
 368                return -ENODEV;
 369        }
 370
 371        if (page_shift > dev_page_max) {
 372                debug("Device maximum page size (%u) smaller than host (%u)\n",
 373                      1 << dev_page_max, 1 << page_shift);
 374                page_shift = dev_page_max;
 375        }
 376
 377        result = nvme_disable_ctrl(dev);
 378        if (result < 0)
 379                return result;
 380
 381        nvmeq = dev->queues[NVME_ADMIN_Q];
 382        if (!nvmeq) {
 383                nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
 384                if (!nvmeq)
 385                        return -ENOMEM;
 386        }
 387
 388        aqa = nvmeq->q_depth - 1;
 389        aqa |= aqa << 16;
 390
 391        dev->page_size = 1 << page_shift;
 392
 393        dev->ctrl_config = NVME_CC_CSS_NVM;
 394        dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
 395        dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
 396        dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
 397
 398        writel(aqa, &dev->bar->aqa);
 399        nvme_writeq((ulong)nvmeq->sq_cmds, &dev->bar->asq);
 400        nvme_writeq((ulong)nvmeq->cqes, &dev->bar->acq);
 401
 402        result = nvme_enable_ctrl(dev);
 403        if (result)
 404                goto free_nvmeq;
 405
 406        nvmeq->cq_vector = 0;
 407
 408        nvme_init_queue(dev->queues[NVME_ADMIN_Q], 0);
 409
 410        return result;
 411
 412 free_nvmeq:
 413        nvme_free_queues(dev, 0);
 414
 415        return result;
 416}
 417
 418static int nvme_alloc_cq(struct nvme_dev *dev, u16 qid,
 419                            struct nvme_queue *nvmeq)
 420{
 421        struct nvme_command c;
 422        int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
 423
 424        memset(&c, 0, sizeof(c));
 425        c.create_cq.opcode = nvme_admin_create_cq;
 426        c.create_cq.prp1 = cpu_to_le64((ulong)nvmeq->cqes);
 427        c.create_cq.cqid = cpu_to_le16(qid);
 428        c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
 429        c.create_cq.cq_flags = cpu_to_le16(flags);
 430        c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
 431
 432        return nvme_submit_admin_cmd(dev, &c, NULL);
 433}
 434
 435static int nvme_alloc_sq(struct nvme_dev *dev, u16 qid,
 436                            struct nvme_queue *nvmeq)
 437{
 438        struct nvme_command c;
 439        int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
 440
 441        memset(&c, 0, sizeof(c));
 442        c.create_sq.opcode = nvme_admin_create_sq;
 443        c.create_sq.prp1 = cpu_to_le64((ulong)nvmeq->sq_cmds);
 444        c.create_sq.sqid = cpu_to_le16(qid);
 445        c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
 446        c.create_sq.sq_flags = cpu_to_le16(flags);
 447        c.create_sq.cqid = cpu_to_le16(qid);
 448
 449        return nvme_submit_admin_cmd(dev, &c, NULL);
 450}
 451
 452int nvme_identify(struct nvme_dev *dev, unsigned nsid,
 453                  unsigned cns, dma_addr_t dma_addr)
 454{
 455        struct nvme_command c;
 456        u32 page_size = dev->page_size;
 457        int offset = dma_addr & (page_size - 1);
 458        int length = sizeof(struct nvme_id_ctrl);
 459        int ret;
 460
 461        memset(&c, 0, sizeof(c));
 462        c.identify.opcode = nvme_admin_identify;
 463        c.identify.nsid = cpu_to_le32(nsid);
 464        c.identify.prp1 = cpu_to_le64(dma_addr);
 465
 466        length -= (page_size - offset);
 467        if (length <= 0) {
 468                c.identify.prp2 = 0;
 469        } else {
 470                dma_addr += (page_size - offset);
 471                c.identify.prp2 = cpu_to_le64(dma_addr);
 472        }
 473
 474        c.identify.cns = cpu_to_le32(cns);
 475
 476        invalidate_dcache_range(dma_addr,
 477                                dma_addr + sizeof(struct nvme_id_ctrl));
 478
 479        ret = nvme_submit_admin_cmd(dev, &c, NULL);
 480        if (!ret)
 481                invalidate_dcache_range(dma_addr,
 482                                        dma_addr + sizeof(struct nvme_id_ctrl));
 483
 484        return ret;
 485}
 486
 487int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
 488                      dma_addr_t dma_addr, u32 *result)
 489{
 490        struct nvme_command c;
 491        int ret;
 492
 493        memset(&c, 0, sizeof(c));
 494        c.features.opcode = nvme_admin_get_features;
 495        c.features.nsid = cpu_to_le32(nsid);
 496        c.features.prp1 = cpu_to_le64(dma_addr);
 497        c.features.fid = cpu_to_le32(fid);
 498
 499        ret = nvme_submit_admin_cmd(dev, &c, result);
 500
 501        /*
 502         * TODO: Add some cache invalidation when a DMA buffer is involved
 503         * in the request, here and before the command gets submitted. The
 504         * buffer size varies by feature, also some features use a different
 505         * field in the command packet to hold the buffer address.
 506         * Section 5.21.1 (Set Features command) in the NVMe specification
 507         * details the buffer requirements for each feature.
 508         *
 509         * At the moment there is no user of this function.
 510         */
 511
 512        return ret;
 513}
 514
 515int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
 516                      dma_addr_t dma_addr, u32 *result)
 517{
 518        struct nvme_command c;
 519
 520        memset(&c, 0, sizeof(c));
 521        c.features.opcode = nvme_admin_set_features;
 522        c.features.prp1 = cpu_to_le64(dma_addr);
 523        c.features.fid = cpu_to_le32(fid);
 524        c.features.dword11 = cpu_to_le32(dword11);
 525
 526        /*
 527         * TODO: Add a cache clean (aka flush) operation when a DMA buffer is
 528         * involved in the request. The buffer size varies by feature, also
 529         * some features use a different field in the command packet to hold
 530         * the buffer address. Section 5.21.1 (Set Features command) in the
 531         * NVMe specification details the buffer requirements for each
 532         * feature.
 533         * At the moment the only user of this function is not using
 534         * any DMA buffer at all.
 535         */
 536
 537        return nvme_submit_admin_cmd(dev, &c, result);
 538}
 539
 540static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 541{
 542        struct nvme_dev *dev = nvmeq->dev;
 543        int result;
 544
 545        nvmeq->cq_vector = qid - 1;
 546        result = nvme_alloc_cq(dev, qid, nvmeq);
 547        if (result < 0)
 548                goto release_cq;
 549
 550        result = nvme_alloc_sq(dev, qid, nvmeq);
 551        if (result < 0)
 552                goto release_sq;
 553
 554        nvme_init_queue(nvmeq, qid);
 555
 556        return result;
 557
 558 release_sq:
 559        nvme_delete_sq(dev, qid);
 560 release_cq:
 561        nvme_delete_cq(dev, qid);
 562
 563        return result;
 564}
 565
 566static int nvme_set_queue_count(struct nvme_dev *dev, int count)
 567{
 568        int status;
 569        u32 result;
 570        u32 q_count = (count - 1) | ((count - 1) << 16);
 571
 572        status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES,
 573                        q_count, 0, &result);
 574
 575        if (status < 0)
 576                return status;
 577        if (status > 1)
 578                return 0;
 579
 580        return min(result & 0xffff, result >> 16) + 1;
 581}
 582
 583static void nvme_create_io_queues(struct nvme_dev *dev)
 584{
 585        unsigned int i;
 586
 587        for (i = dev->queue_count; i <= dev->max_qid; i++)
 588                if (!nvme_alloc_queue(dev, i, dev->q_depth))
 589                        break;
 590
 591        for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
 592                if (nvme_create_queue(dev->queues[i], i))
 593                        break;
 594}
 595
 596static int nvme_setup_io_queues(struct nvme_dev *dev)
 597{
 598        int nr_io_queues;
 599        int result;
 600
 601        nr_io_queues = 1;
 602        result = nvme_set_queue_count(dev, nr_io_queues);
 603        if (result <= 0)
 604                return result;
 605
 606        dev->max_qid = nr_io_queues;
 607
 608        /* Free previously allocated queues */
 609        nvme_free_queues(dev, nr_io_queues + 1);
 610        nvme_create_io_queues(dev);
 611
 612        return 0;
 613}
 614
 615static int nvme_get_info_from_identify(struct nvme_dev *dev)
 616{
 617        struct nvme_id_ctrl *ctrl;
 618        int ret;
 619        int shift = NVME_CAP_MPSMIN(dev->cap) + 12;
 620
 621        ctrl = memalign(dev->page_size, sizeof(struct nvme_id_ctrl));
 622        if (!ctrl)
 623                return -ENOMEM;
 624
 625        ret = nvme_identify(dev, 0, 1, (dma_addr_t)(long)ctrl);
 626        if (ret) {
 627                free(ctrl);
 628                return -EIO;
 629        }
 630
 631        dev->nn = le32_to_cpu(ctrl->nn);
 632        dev->vwc = ctrl->vwc;
 633        memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
 634        memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
 635        memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
 636        if (ctrl->mdts)
 637                dev->max_transfer_shift = (ctrl->mdts + shift);
 638        else {
 639                /*
 640                 * Maximum Data Transfer Size (MDTS) field indicates the maximum
 641                 * data transfer size between the host and the controller. The
 642                 * host should not submit a command that exceeds this transfer
 643                 * size. The value is in units of the minimum memory page size
 644                 * and is reported as a power of two (2^n).
 645                 *
 646                 * The spec also says: a value of 0h indicates no restrictions
 647                 * on transfer size. But in nvme_blk_read/write() below we have
 648                 * the following algorithm for maximum number of logic blocks
 649                 * per transfer:
 650                 *
 651                 * u16 lbas = 1 << (dev->max_transfer_shift - ns->lba_shift);
 652                 *
 653                 * In order for lbas not to overflow, the maximum number is 15
 654                 * which means dev->max_transfer_shift = 15 + 9 (ns->lba_shift).
 655                 * Let's use 20 which provides 1MB size.
 656                 */
 657                dev->max_transfer_shift = 20;
 658        }
 659
 660        free(ctrl);
 661        return 0;
 662}
 663
 664int nvme_get_namespace_id(struct udevice *udev, u32 *ns_id, u8 *eui64)
 665{
 666        struct nvme_ns *ns = dev_get_priv(udev);
 667
 668        if (ns_id)
 669                *ns_id = ns->ns_id;
 670        if (eui64)
 671                memcpy(eui64, ns->eui64, sizeof(ns->eui64));
 672
 673        return 0;
 674}
 675
 676int nvme_scan_namespace(void)
 677{
 678        struct uclass *uc;
 679        struct udevice *dev;
 680        int ret;
 681
 682        ret = uclass_get(UCLASS_NVME, &uc);
 683        if (ret)
 684                return ret;
 685
 686        uclass_foreach_dev(dev, uc) {
 687                ret = device_probe(dev);
 688                if (ret)
 689                        return ret;
 690        }
 691
 692        return 0;
 693}
 694
 695static int nvme_blk_probe(struct udevice *udev)
 696{
 697        struct nvme_dev *ndev = dev_get_priv(udev->parent);
 698        struct blk_desc *desc = dev_get_uclass_plat(udev);
 699        struct nvme_ns *ns = dev_get_priv(udev);
 700        u8 flbas;
 701        struct pci_child_plat *pplat;
 702        struct nvme_id_ns *id;
 703
 704        id = memalign(ndev->page_size, sizeof(struct nvme_id_ns));
 705        if (!id)
 706                return -ENOMEM;
 707
 708        ns->dev = ndev;
 709        /* extract the namespace id from the block device name */
 710        ns->ns_id = trailing_strtol(udev->name);
 711        if (nvme_identify(ndev, ns->ns_id, 0, (dma_addr_t)(long)id)) {
 712                free(id);
 713                return -EIO;
 714        }
 715
 716        memcpy(&ns->eui64, &id->eui64, sizeof(id->eui64));
 717        flbas = id->flbas & NVME_NS_FLBAS_LBA_MASK;
 718        ns->flbas = flbas;
 719        ns->lba_shift = id->lbaf[flbas].ds;
 720        list_add(&ns->list, &ndev->namespaces);
 721
 722        desc->lba = le64_to_cpu(id->nsze);
 723        desc->log2blksz = ns->lba_shift;
 724        desc->blksz = 1 << ns->lba_shift;
 725        desc->bdev = udev;
 726        pplat = dev_get_parent_plat(udev->parent);
 727        sprintf(desc->vendor, "0x%.4x", pplat->vendor);
 728        memcpy(desc->product, ndev->serial, sizeof(ndev->serial));
 729        memcpy(desc->revision, ndev->firmware_rev, sizeof(ndev->firmware_rev));
 730
 731        free(id);
 732        return 0;
 733}
 734
 735static ulong nvme_blk_rw(struct udevice *udev, lbaint_t blknr,
 736                         lbaint_t blkcnt, void *buffer, bool read)
 737{
 738        struct nvme_ns *ns = dev_get_priv(udev);
 739        struct nvme_dev *dev = ns->dev;
 740        struct nvme_command c;
 741        struct blk_desc *desc = dev_get_uclass_plat(udev);
 742        int status;
 743        u64 prp2;
 744        u64 total_len = blkcnt << desc->log2blksz;
 745        u64 temp_len = total_len;
 746        uintptr_t temp_buffer = (uintptr_t)buffer;
 747
 748        u64 slba = blknr;
 749        u16 lbas = 1 << (dev->max_transfer_shift - ns->lba_shift);
 750        u64 total_lbas = blkcnt;
 751
 752        flush_dcache_range((unsigned long)buffer,
 753                           (unsigned long)buffer + total_len);
 754
 755        c.rw.opcode = read ? nvme_cmd_read : nvme_cmd_write;
 756        c.rw.flags = 0;
 757        c.rw.nsid = cpu_to_le32(ns->ns_id);
 758        c.rw.control = 0;
 759        c.rw.dsmgmt = 0;
 760        c.rw.reftag = 0;
 761        c.rw.apptag = 0;
 762        c.rw.appmask = 0;
 763        c.rw.metadata = 0;
 764
 765        while (total_lbas) {
 766                if (total_lbas < lbas) {
 767                        lbas = (u16)total_lbas;
 768                        total_lbas = 0;
 769                } else {
 770                        total_lbas -= lbas;
 771                }
 772
 773                if (nvme_setup_prps(dev, &prp2,
 774                                    lbas << ns->lba_shift, temp_buffer))
 775                        return -EIO;
 776                c.rw.slba = cpu_to_le64(slba);
 777                slba += lbas;
 778                c.rw.length = cpu_to_le16(lbas - 1);
 779                c.rw.prp1 = cpu_to_le64(temp_buffer);
 780                c.rw.prp2 = cpu_to_le64(prp2);
 781                status = nvme_submit_sync_cmd(dev->queues[NVME_IO_Q],
 782                                &c, NULL, IO_TIMEOUT);
 783                if (status)
 784                        break;
 785                temp_len -= (u32)lbas << ns->lba_shift;
 786                temp_buffer += lbas << ns->lba_shift;
 787        }
 788
 789        if (read)
 790                invalidate_dcache_range((unsigned long)buffer,
 791                                        (unsigned long)buffer + total_len);
 792
 793        return (total_len - temp_len) >> desc->log2blksz;
 794}
 795
 796static ulong nvme_blk_read(struct udevice *udev, lbaint_t blknr,
 797                           lbaint_t blkcnt, void *buffer)
 798{
 799        return nvme_blk_rw(udev, blknr, blkcnt, buffer, true);
 800}
 801
 802static ulong nvme_blk_write(struct udevice *udev, lbaint_t blknr,
 803                            lbaint_t blkcnt, const void *buffer)
 804{
 805        return nvme_blk_rw(udev, blknr, blkcnt, (void *)buffer, false);
 806}
 807
 808static const struct blk_ops nvme_blk_ops = {
 809        .read   = nvme_blk_read,
 810        .write  = nvme_blk_write,
 811};
 812
 813U_BOOT_DRIVER(nvme_blk) = {
 814        .name   = "nvme-blk",
 815        .id     = UCLASS_BLK,
 816        .probe  = nvme_blk_probe,
 817        .ops    = &nvme_blk_ops,
 818        .priv_auto      = sizeof(struct nvme_ns),
 819};
 820
 /**
  * nvme_bind() - name a newly bound controller "nvme#<n>"
  *
  * @udev:       controller device being bound
  *
  * <n> comes from a counter that starts at 0 and grows by one per call.
  *
  * Return: the value returned by device_set_name()
  */
 static int nvme_bind(struct udevice *udev)
 {
         static int next_index;
         char name[20];

         sprintf(name, "nvme#%d", next_index);
         next_index++;

         return device_set_name(udev, name);
 }
 830
 831static int nvme_probe(struct udevice *udev)
 832{
 833        int ret;
 834        struct nvme_dev *ndev = dev_get_priv(udev);
 835        struct nvme_id_ns *id;
 836
 837        ndev->instance = trailing_strtol(udev->name);
 838
 839        INIT_LIST_HEAD(&ndev->namespaces);
 840        ndev->bar = dm_pci_map_bar(udev, PCI_BASE_ADDRESS_0,
 841                        PCI_REGION_MEM);
 842        if (readl(&ndev->bar->csts) == -1) {
 843                ret = -ENODEV;
 844                printf("Error: %s: Out of memory!\n", udev->name);
 845                goto free_nvme;
 846        }
 847
 848        ndev->queues = malloc(NVME_Q_NUM * sizeof(struct nvme_queue *));
 849        if (!ndev->queues) {
 850                ret = -ENOMEM;
 851                printf("Error: %s: Out of memory!\n", udev->name);
 852                goto free_nvme;
 853        }
 854        memset(ndev->queues, 0, NVME_Q_NUM * sizeof(struct nvme_queue *));
 855
 856        ndev->cap = nvme_readq(&ndev->bar->cap);
 857        ndev->q_depth = min_t(int, NVME_CAP_MQES(ndev->cap) + 1, NVME_Q_DEPTH);
 858        ndev->db_stride = 1 << NVME_CAP_STRIDE(ndev->cap);
 859        ndev->dbs = ((void __iomem *)ndev->bar) + 4096;
 860
 861        ret = nvme_configure_admin_queue(ndev);
 862        if (ret)
 863                goto free_queue;
 864
 865        /* Allocate after the page size is known */
 866        ndev->prp_pool = memalign(ndev->page_size, MAX_PRP_POOL);
 867        if (!ndev->prp_pool) {
 868                ret = -ENOMEM;
 869                printf("Error: %s: Out of memory!\n", udev->name);
 870                goto free_nvme;
 871        }
 872        ndev->prp_entry_num = MAX_PRP_POOL >> 3;
 873
 874        ret = nvme_setup_io_queues(ndev);
 875        if (ret)
 876                goto free_queue;
 877
 878        nvme_get_info_from_identify(ndev);
 879
 880        /* Create a blk device for each namespace */
 881
 882        id = memalign(ndev->page_size, sizeof(struct nvme_id_ns));
 883        if (!id) {
 884                ret = -ENOMEM;
 885                goto free_queue;
 886        }
 887
 888        for (int i = 1; i <= ndev->nn; i++) {
 889                struct udevice *ns_udev;
 890                char name[20];
 891
 892                memset(id, 0, sizeof(*id));
 893                if (nvme_identify(ndev, i, 0, (dma_addr_t)(long)id)) {
 894                        ret = -EIO;
 895                        goto free_id;
 896                }
 897
 898                /* skip inactive namespace */
 899                if (!id->nsze)
 900                        continue;
 901
 902                /*
 903                 * Encode the namespace id to the device name so that
 904                 * we can extract it when doing the probe.
 905                 */
 906                sprintf(name, "blk#%d", i);
 907
 908                /* The real blksz and size will be set by nvme_blk_probe() */
 909                ret = blk_create_devicef(udev, "nvme-blk", name, IF_TYPE_NVME,
 910                                         -1, 512, 0, &ns_udev);
 911                if (ret)
 912                        goto free_id;
 913        }
 914
 915        free(id);
 916        return 0;
 917
 918free_id:
 919        free(id);
 920free_queue:
 921        free((void *)ndev->queues);
 922free_nvme:
 923        return ret;
 924}
 925
 926U_BOOT_DRIVER(nvme) = {
 927        .name   = "nvme",
 928        .id     = UCLASS_NVME,
 929        .bind   = nvme_bind,
 930        .probe  = nvme_probe,
 931        .priv_auto      = sizeof(struct nvme_dev),
 932};
 933
 934struct pci_device_id nvme_supported[] = {
 935        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, ~0) },
 936        {}
 937};
 938
 939U_BOOT_PCI_DEVICE(nvme, nvme_supported);
 940