linux/drivers/virtio/virtio_ring.c
/* Virtio ring implementation.
 *
 *  Copyright 2007 Rusty Russell IBM Corporation
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_config.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/kmemleak.h>
#include <linux/dma-mapping.h>
#include <xen/xen.h>

#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...)                             \
        do {                                                    \
                dev_err(&(_vq)->vq.vdev->dev,                   \
                        "%s:"fmt, (_vq)->vq.name, ##args);      \
                BUG();                                          \
        } while (0)
/* Caller is supposed to guarantee no reentry. */
#define START_USE(_vq)                                          \
        do {                                                    \
                if ((_vq)->in_use)                              \
                        panic("%s:in_use = %i\n",               \
                              (_vq)->vq.name, (_vq)->in_use);   \
                (_vq)->in_use = __LINE__;                       \
        } while (0)
#define END_USE(_vq) \
        do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while (0)
#else
#define BAD_RING(_vq, fmt, args...)                             \
        do {                                                    \
                dev_err(&_vq->vq.vdev->dev,                     \
                        "%s:"fmt, (_vq)->vq.name, ##args);      \
                (_vq)->broken = true;                           \
        } while (0)
#define START_USE(vq)
#define END_USE(vq)
#endif

struct vring_desc_state {
        void *data;                     /* Data for callback. */
        struct vring_desc *indir_desc;  /* Indirect descriptor, if any. */
};

struct vring_virtqueue {
        struct virtqueue vq;

        /* Actual memory layout for this queue */
        struct vring vring;

        /* Can we use weak barriers? */
        bool weak_barriers;

        /* Other side has made a mess, don't try any more. */
        bool broken;

        /* Host supports indirect buffers */
        bool indirect;

        /* Host publishes avail event idx */
        bool event;

        /* Head of free buffer list. */
        unsigned int free_head;
        /* Number we've added since last sync. */
        unsigned int num_added;

        /* Last used index we've seen. */
        u16 last_used_idx;

        /* Last written value to avail->flags */
        u16 avail_flags_shadow;

        /* Last written value to avail->idx in guest byte order */
        u16 avail_idx_shadow;

        /* How to notify other side. FIXME: commonalize hcalls! */
        bool (*notify)(struct virtqueue *vq);

        /* DMA, allocation, and size information */
        bool we_own_ring;
        size_t queue_size_in_bytes;
        dma_addr_t queue_dma_addr;

#ifdef DEBUG
        /* They're supposed to lock for us. */
        unsigned int in_use;

        /* Figure out if their kicks are too delayed. */
        bool last_add_time_valid;
        ktime_t last_add_time;
#endif

        /* Per-descriptor state. */
        struct vring_desc_state desc_state[];
};

#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)

/*
 * The interaction between virtio and a possible IOMMU is a mess.
 *
 * On most systems with virtio, physical addresses match bus addresses,
 * and it doesn't particularly matter whether we use the DMA API.
 *
 * On some systems, including Xen and any system with a physical device
 * that speaks virtio behind a physical IOMMU, we must use the DMA API
 * for virtio DMA to work at all.
 *
 * On other systems, including SPARC and PPC64, virtio-pci devices are
 * enumerated as though they are behind an IOMMU, but the virtio host
 * ignores the IOMMU, so we must either pretend that the IOMMU isn't
 * there or somehow map everything as the identity.
 *
 * For the time being, we preserve historic behavior and bypass the DMA
 * API.
 */

static bool vring_use_dma_api(struct virtio_device *vdev)
{
        /*
         * In theory, it's possible to have a buggy QEMU-supplied
         * emulated Q35 IOMMU and Xen enabled at the same time.  On
         * such a configuration, virtio has never worked and will
         * not work without an even larger kludge.  Instead, enable
         * the DMA API if we're a Xen guest, which at least allows
         * all of the sensible Xen configurations to work correctly.
         */
        if (xen_domain())
                return true;

        return false;
}

/*
 * The DMA ops on various arches are rather gnarly right now, and
 * making all of the arch DMA ops work on the vring device itself
 * is a mess.  For now, we use the parent device for DMA ops.
 */
struct device *vring_dma_dev(const struct vring_virtqueue *vq)
{
        return vq->vq.vdev->dev.parent;
}

#if 0
/* Map one sg entry. */
static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
                                   struct scatterlist *sg,
                                   enum dma_data_direction direction)
{
        if (!vring_use_dma_api(vq->vq.vdev))
                return (dma_addr_t)sg_phys(sg);

        /*
         * We can't use dma_map_sg, because we don't use scatterlists in
         * the way it expects (we don't guarantee that the scatterlist
         * will exist for the lifetime of the mapping).
         */
        return dma_map_page(vring_dma_dev(vq),
                            sg_page(sg), sg->offset, sg->length,
                            direction);
}
#endif

static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
                                   void *cpu_addr, size_t size,
                                   enum dma_data_direction direction)
{
        if (!vring_use_dma_api(vq->vq.vdev))
                return (dma_addr_t)virt_to_phys(cpu_addr);

        return dma_map_single(vring_dma_dev(vq),
                              cpu_addr, size, direction);
}

static void vring_unmap_one(const struct vring_virtqueue *vq,
                            struct vring_desc *desc)
{
        u16 flags;

        if (!vring_use_dma_api(vq->vq.vdev))
                return;

        flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);

        if (flags & VRING_DESC_F_INDIRECT) {
                dma_unmap_single(vring_dma_dev(vq),
                                 virtio64_to_cpu(vq->vq.vdev, desc->addr),
                                 virtio32_to_cpu(vq->vq.vdev, desc->len),
                                 (flags & VRING_DESC_F_WRITE) ?
                                 DMA_FROM_DEVICE : DMA_TO_DEVICE);
        } else {
                dma_unmap_page(vring_dma_dev(vq),
                               virtio64_to_cpu(vq->vq.vdev, desc->addr),
                               virtio32_to_cpu(vq->vq.vdev, desc->len),
                               (flags & VRING_DESC_F_WRITE) ?
                               DMA_FROM_DEVICE : DMA_TO_DEVICE);
        }
}

static int vring_mapping_error(const struct vring_virtqueue *vq,
                               dma_addr_t addr)
{
        if (!vring_use_dma_api(vq->vq.vdev))
                return 0;

        return dma_mapping_error(vring_dma_dev(vq), addr);
}

static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
                                         unsigned int total_sg, gfp_t gfp)
{
        struct vring_desc *desc;
        unsigned int i;

        /*
         * We require lowmem mappings for the descriptors because
         * otherwise virt_to_phys will give us bogus addresses in the
         * virtqueue.
         */
        gfp &= ~__GFP_HIGHMEM;

        desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
        if (!desc)
                return NULL;

        for (i = 0; i < total_sg; i++)
                desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
        return desc;
}

static inline void vring_desc_set(struct virtio_device *vdev,
                                  struct vring_desc *desc,
                                  struct scatterlist *sg,
                                  unsigned int flags,
                                  bool dma)
{
        desc->flags = cpu_to_virtio16(vdev, flags);
        desc->addr = cpu_to_virtio64(vdev,
                                     dma ? sg_dma_address(sg) : sg_phys(sg));
        desc->len = cpu_to_virtio32(vdev, dma ? sg_dma_len(sg) : sg->length);
}

static inline int virtqueue_add(struct virtqueue *_vq,
                                struct scatterlist *sgs[],
                                unsigned int total_sg,
                                unsigned int out_sgs,
                                unsigned int in_sgs,
                                void *data,
                                gfp_t gfp,
                                bool dma)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct scatterlist *sg;
        struct vring_desc *desc;
        unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
        int head;
        bool indirect;

        START_USE(vq);

        BUG_ON(data == NULL);

        if (unlikely(vq->broken)) {
                END_USE(vq);
                return -EIO;
        }

#ifdef DEBUG
        {
                ktime_t now = ktime_get();

                /* No kick or get, with .1 second between?  Warn. */
                if (vq->last_add_time_valid)
                        WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
                                            > 100);
                vq->last_add_time = now;
                vq->last_add_time_valid = true;
        }
#endif

        BUG_ON(total_sg > vq->vring.num);
        BUG_ON(total_sg == 0);

        head = vq->free_head;

        /* If the host supports indirect descriptor tables, and we have multiple
         * buffers, then go indirect. FIXME: tune this threshold */
        if (!dma && vq->indirect && total_sg > 1 && vq->vq.num_free)
                desc = alloc_indirect(_vq, total_sg, gfp);
        else
                desc = NULL;

        if (desc) {
                /* Use a single buffer which doesn't continue */
                indirect = true;
                /* Set up rest to use this indirect table. */
                i = 0;
                descs_used = 1;
        } else {
                indirect = false;
                desc = vq->vring.desc;
                i = head;
                descs_used = total_sg;
        }

        if (vq->vq.num_free < descs_used) {
                pr_debug("Can't add buf len %i - avail = %i\n",
                         descs_used, vq->vq.num_free);
                /* FIXME: for historical reasons, we force a notify here if
                 * there are outgoing parts to the buffer.  Presumably the
                 * host should service the ring ASAP. */
                if (out_sgs)
                        vq->notify(&vq->vq);
                END_USE(vq);
                return -ENOSPC;
        }

        for (n = 0; n < out_sgs; n++) {
                for (sg = sgs[n]; sg; sg = sg_next(sg)) {
                        vring_desc_set(_vq->vdev, desc + i, sg,
                                       VRING_DESC_F_NEXT, dma);
                        prev = i;
                        i = virtio16_to_cpu(_vq->vdev, desc[i].next);
                }
        }
        for (; n < (out_sgs + in_sgs); n++) {
                for (sg = sgs[n]; sg; sg = sg_next(sg)) {
                        vring_desc_set(_vq->vdev, desc + i, sg,
                                       VRING_DESC_F_NEXT | VRING_DESC_F_WRITE,
                                       dma);
                        prev = i;
                        i = virtio16_to_cpu(_vq->vdev, desc[i].next);
                }
        }
        /* Last one doesn't continue. */
        desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);

        if (indirect) {
                /* Now that the indirect table is filled in, map it. */
                dma_addr_t addr = vring_map_single(
                        vq, desc, total_sg * sizeof(struct vring_desc),
                        DMA_TO_DEVICE);
                if (vring_mapping_error(vq, addr))
                        goto unmap_release;

                vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
                vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);

                vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
        }

        /* We're using some buffers from the free list. */
        vq->vq.num_free -= descs_used;

        /* Update free pointer */
        if (indirect)
                vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next);
        else
                vq->free_head = i;

        /* Store token and indirect buffer state. */
        vq->desc_state[head].data = data;
        if (indirect)
                vq->desc_state[head].indir_desc = desc;

        /* Put entry in available array (but don't update avail->idx until they
         * do sync). */
        avail = vq->avail_idx_shadow & (vq->vring.num - 1);
        vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

        /* Descriptors and available array need to be set before we expose the
         * new available array entries. */
        virtio_wmb(vq->weak_barriers);
        vq->avail_idx_shadow++;
        vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
        vq->num_added++;

        pr_debug("Added buffer head %i to %p\n", head, vq);
        END_USE(vq);

        /* This is very unlikely, but theoretically possible.  Kick
         * just in case. */
        if (unlikely(vq->num_added == (1 << 16) - 1))
                virtqueue_kick(_vq);

        return 0;
unmap_release:
        err_idx = i;
        i = head;

        for (n = 0; n < total_sg; n++) {
                if (i == err_idx)
                        break;
                vring_unmap_one(vq, &desc[i]);
                i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
        }

        vq->vq.num_free += total_sg;

        if (indirect)
                kfree(desc);

        return -EIO;
}

/**
 * virtqueue_add_sgs - expose buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sgs: array of terminated scatterlists.
 * @out_sgs: the number of scatterlists readable by other side
 * @in_sgs: the number of scatterlists which are writable (after readable ones)
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 * @dma: Use DMA mapped scatterlists. (Only for remoteproc/rpmsg).
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int __virtqueue_add_sgs(struct virtqueue *_vq,
                        struct scatterlist *sgs[],
                        unsigned int out_sgs,
                        unsigned int in_sgs,
                        void *data,
                        gfp_t gfp,
                        bool dma)
{
        unsigned int i, total_sg = 0;

        /* Count them first. */
        for (i = 0; i < out_sgs + in_sgs; i++) {
                struct scatterlist *sg;
                for (sg = sgs[i]; sg; sg = sg_next(sg))
                        total_sg++;
        }
        return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, gfp,
                             dma);
}
EXPORT_SYMBOL_GPL(__virtqueue_add_sgs);

int virtqueue_add_sgs(struct virtqueue *_vq,
                      struct scatterlist *sgs[],
                      unsigned int out_sgs,
                      unsigned int in_sgs,
                      void *data,
                      gfp_t gfp)
{
        return __virtqueue_add_sgs(_vq, sgs, out_sgs, in_sgs, data, gfp, false);
}
EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
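
/*
 * Illustrative sketch only (not part of this file's build, hence the #if 0,
 * following the convention used above): a driver queuing a request with one
 * device-readable and one device-writable buffer via virtqueue_add_sgs().
 * The req/resp buffers and the token are hypothetical; they must remain
 * allocated until the token comes back from virtqueue_get_buf().
 */
#if 0
static int example_queue_request(struct virtqueue *vq,
                                 void *req, size_t req_len,
                                 void *resp, size_t resp_len, void *token)
{
        struct scatterlist out, in;
        struct scatterlist *sgs[] = { &out, &in };

        sg_init_one(&out, req, req_len);
        sg_init_one(&in, resp, resp_len);

        /* One readable sg list followed by one writable sg list. */
        return virtqueue_add_sgs(vq, sgs, 1, 1, token, GFP_ATOMIC);
}
#endif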

/**
 * virtqueue_add_outbuf - expose output buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (must be well-formed and terminated!)
 * @num: the number of entries in @sg readable by other side
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_outbuf(struct virtqueue *vq,
                         struct scatterlist *sg, unsigned int num,
                         void *data,
                         gfp_t gfp)
{
        return virtqueue_add(vq, &sg, num, 1, 0, data, gfp, false);
}
EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);

/**
 * virtqueue_add_inbuf - expose input buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (must be well-formed and terminated!)
 * @num: the number of entries in @sg writable by other side
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_inbuf(struct virtqueue *vq,
                        struct scatterlist *sg, unsigned int num,
                        void *data,
                        gfp_t gfp)
{
        return virtqueue_add(vq, &sg, num, 0, 1, data, gfp, false);
}
EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);

/**
 * virtqueue_kick_prepare - first half of split virtqueue_kick call.
 * @vq: the struct virtqueue
 *
 * Instead of virtqueue_kick(), you can do:
 *      if (virtqueue_kick_prepare(vq))
 *              virtqueue_notify(vq);
 *
 * This is sometimes useful because the virtqueue_kick_prepare() needs
 * to be serialized, but the actual virtqueue_notify() call does not.
 */
bool virtqueue_kick_prepare(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 new, old;
        bool needs_kick;

        START_USE(vq);
        /* We need to expose available array entries before checking avail
         * event. */
        virtio_mb(vq->weak_barriers);

        old = vq->avail_idx_shadow - vq->num_added;
        new = vq->avail_idx_shadow;
        vq->num_added = 0;

#ifdef DEBUG
        if (vq->last_add_time_valid) {
                WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
                                              vq->last_add_time)) > 100);
        }
        vq->last_add_time_valid = false;
#endif

        if (vq->event) {
                needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
                                              new, old);
        } else {
                needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
        }
        END_USE(vq);
        return needs_kick;
}
EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);

/**
 * virtqueue_notify - second half of split virtqueue_kick call.
 * @vq: the struct virtqueue
 *
 * This does not need to be serialized.
 *
 * Returns false if host notify failed or queue is broken, otherwise true.
 */
bool virtqueue_notify(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (unlikely(vq->broken))
                return false;

        /* Prod other side to tell it about changes. */
        if (!vq->notify(_vq)) {
                vq->broken = true;
                return false;
        }
        return true;
}
EXPORT_SYMBOL_GPL(virtqueue_notify);

/**
 * virtqueue_kick - update after add_buf
 * @vq: the struct virtqueue
 *
 * After one or more virtqueue_add_* calls, invoke this to kick
 * the other side.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 *
 * Returns false if kick failed, otherwise true.
 */
bool virtqueue_kick(struct virtqueue *vq)
{
        if (virtqueue_kick_prepare(vq))
                return virtqueue_notify(vq);
        return true;
}
EXPORT_SYMBOL_GPL(virtqueue_kick);
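
/*
 * Illustrative sketch only (not built, #if 0 as above): why the kick is split.
 * Queueing and virtqueue_kick_prepare() must run under the driver's own lock,
 * but the notification itself (often a trap into the hypervisor) can be done
 * after dropping it.  The spinlock, sg and token here are hypothetical.
 */
#if 0
static void example_submit(struct virtqueue *vq, spinlock_t *lock,
                           struct scatterlist *sg, void *token)
{
        bool notify;

        spin_lock(lock);
        if (virtqueue_add_outbuf(vq, sg, 1, token, GFP_ATOMIC) < 0) {
                spin_unlock(lock);
                return;
        }
        notify = virtqueue_kick_prepare(vq);
        spin_unlock(lock);

        /* The doorbell may be slow (e.g. a VM exit); do it unlocked. */
        if (notify)
                virtqueue_notify(vq);
}
#endif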

static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
{
        unsigned int i, j;
        u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);

        /* Clear data ptr. */
        vq->desc_state[head].data = NULL;

        /* Put back on free list: unmap first-level descriptors and find end */
        i = head;

        while (vq->vring.desc[i].flags & nextflag) {
                vring_unmap_one(vq, &vq->vring.desc[i]);
                i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
                vq->vq.num_free++;
        }

        vring_unmap_one(vq, &vq->vring.desc[i]);
        vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
        vq->free_head = head;

        /* Plus final descriptor */
        vq->vq.num_free++;

        /* Free the indirect table, if any, now that it's unmapped. */
        if (vq->desc_state[head].indir_desc) {
                struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
                u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);

                BUG_ON(!(vq->vring.desc[head].flags &
                         cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
                BUG_ON(len == 0 || len % sizeof(struct vring_desc));

                for (j = 0; j < len / sizeof(struct vring_desc); j++)
                        vring_unmap_one(vq, &indir_desc[j]);

                kfree(vq->desc_state[head].indir_desc);
                vq->desc_state[head].indir_desc = NULL;
        }
}

static inline bool more_used(const struct vring_virtqueue *vq)
{
        return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
}
/**
 * virtqueue_get_buf - get the next used buffer
 * @vq: the struct virtqueue we're talking about.
 * @len: the length written into the buffer
 *
 * If the device wrote data into the buffer, @len will be set to the
 * amount written.  This means you don't need to clear the buffer
 * beforehand to ensure there's no data leakage in the case of short
 * writes.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 *
 * Returns NULL if there are no used buffers, or the "data" token
 * handed to virtqueue_add_*().
 */
void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        void *ret;
        unsigned int i;
        u16 last_used;

        START_USE(vq);

        if (unlikely(vq->broken)) {
                END_USE(vq);
                return NULL;
        }

        if (!more_used(vq)) {
                pr_debug("No more buffers in queue\n");
                END_USE(vq);
                return NULL;
        }

        /* Only get used array entries after they have been exposed by host. */
        virtio_rmb(vq->weak_barriers);

        last_used = (vq->last_used_idx & (vq->vring.num - 1));
        i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
        *len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);

        if (unlikely(i >= vq->vring.num)) {
                BAD_RING(vq, "id %u out of range\n", i);
                return NULL;
        }
        if (unlikely(!vq->desc_state[i].data)) {
                BAD_RING(vq, "id %u is not a head!\n", i);
                return NULL;
        }

        /* detach_buf clears data, so grab it now. */
        ret = vq->desc_state[i].data;
        detach_buf(vq, i);
        vq->last_used_idx++;
        /* If we expect an interrupt for the next entry, tell host
         * by writing event index and flush out the write before
         * the read in the next get_buf call. */
        if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
                virtio_store_mb(vq->weak_barriers,
                                &vring_used_event(&vq->vring),
                                cpu_to_virtio16(_vq->vdev, vq->last_used_idx));

#ifdef DEBUG
        vq->last_add_time_valid = false;
#endif

        END_USE(vq);
        return ret;
}
EXPORT_SYMBOL_GPL(virtqueue_get_buf);

/**
 * virtqueue_disable_cb - disable callbacks
 * @vq: the struct virtqueue we're talking about.
 *
 * Note that this is not necessarily synchronous, hence unreliable and only
 * useful as an optimization.
 *
 * Unlike other operations, this need not be serialized.
 */
void virtqueue_disable_cb(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
                vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
                vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
        }
}
EXPORT_SYMBOL_GPL(virtqueue_disable_cb);

/**
 * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
 * @vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks; it returns current queue state
 * in an opaque unsigned value. This value should be later tested by
 * virtqueue_poll, to detect a possible race between the driver checking for
 * more work, and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 last_used_idx;

        START_USE(vq);

        /* We optimistically turn back on interrupts, then check if there was
         * more to do. */
        /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
         * either clear the flags bit or point the event index at the next
         * entry. Always do both to keep code simple. */
        if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
                vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
                vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
        }
        vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
        END_USE(vq);
        return last_used_idx;
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);

/**
 * virtqueue_poll - query pending used buffers
 * @vq: the struct virtqueue we're talking about.
 * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare).
 *
 * Returns "true" if there are pending used buffers in the queue.
 *
 * This does not need to be serialized.
 */
bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        virtio_mb(vq->weak_barriers);
        return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
}
EXPORT_SYMBOL_GPL(virtqueue_poll);

/**
 * virtqueue_enable_cb - restart callbacks after disable_cb.
 * @vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks; it returns "false" if there are pending
 * buffers in the queue, to detect a possible race between the driver
 * checking for more work, and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
bool virtqueue_enable_cb(struct virtqueue *_vq)
{
        unsigned last_used_idx = virtqueue_enable_cb_prepare(_vq);
        return !virtqueue_poll(_vq, last_used_idx);
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
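
/*
 * Illustrative sketch only (not built, #if 0 as above): the usual callback
 * pattern pairing virtqueue_disable_cb()/virtqueue_enable_cb() with
 * virtqueue_get_buf(), so that a completion arriving while callbacks are
 * being re-enabled is not lost.  A real driver would complete the request
 * associated with each token instead of the pr_debug().
 */
#if 0
static void example_vq_callback(struct virtqueue *vq)
{
        unsigned int len;
        void *token;

        do {
                virtqueue_disable_cb(vq);
                while ((token = virtqueue_get_buf(vq, &len)) != NULL)
                        pr_debug("completed %p, %u bytes\n", token, len);
        } while (!virtqueue_enable_cb(vq));
}
#endif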

/**
 * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
 * @vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks but hints to the other side to delay
 * interrupts until most of the available buffers have been processed;
 * it returns "false" if there are many pending buffers in the queue,
 * to detect a possible race between the driver checking for more work,
 * and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 bufs;

        START_USE(vq);

        /* We optimistically turn back on interrupts, then check if there was
         * more to do. */
        /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
         * either clear the flags bit or point the event index at the next
         * entry. Always do both to keep code simple. */
        if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
                vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
                vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
        }
        /* TODO: tune this threshold */
        bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;

        virtio_store_mb(vq->weak_barriers,
                        &vring_used_event(&vq->vring),
                        cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));

        if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
                END_USE(vq);
                return false;
        }

        END_USE(vq);
        return true;
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
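
/*
 * Illustrative sketch only (not built, #if 0 as above): a transmit-style
 * reclaim path that uses virtqueue_enable_cb_delayed() so the device can
 * batch interrupts until roughly three quarters of the outstanding buffers
 * are used.  A real driver would release each token; here we only count.
 */
#if 0
static unsigned int example_reclaim_tx(struct virtqueue *vq)
{
        unsigned int len, freed = 0;
        void *token;

        /* First pass: collect whatever the device has already used. */
        while ((token = virtqueue_get_buf(vq, &len)) != NULL)
                freed++;

        /* A "false" return means more buffers completed while re-arming. */
        if (!virtqueue_enable_cb_delayed(vq))
                while ((token = virtqueue_get_buf(vq, &len)) != NULL)
                        freed++;

        return freed;
}
#endif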

/**
 * virtqueue_detach_unused_buf - detach first unused buffer
 * @vq: the struct virtqueue we're talking about.
 *
 * Returns NULL or the "data" token handed to virtqueue_add_*().
 * This is not valid on an active queue; it is useful only for device
 * shutdown.
 */
void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        unsigned int i;
        void *buf;

        START_USE(vq);

        for (i = 0; i < vq->vring.num; i++) {
                if (!vq->desc_state[i].data)
                        continue;
                /* detach_buf clears data, so grab it now. */
                buf = vq->desc_state[i].data;
                detach_buf(vq, i);
                vq->avail_idx_shadow--;
                vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
                END_USE(vq);
                return buf;
        }
        /* That should have freed everything. */
        BUG_ON(vq->vq.num_free != vq->vring.num);

        END_USE(vq);
        return NULL;
}
EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);

irqreturn_t vring_interrupt(int irq, void *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (!more_used(vq)) {
                pr_debug("virtqueue interrupt with no work for %p\n", vq);
                return IRQ_NONE;
        }

        if (unlikely(vq->broken))
                return IRQ_HANDLED;

        pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
        if (vq->vq.callback)
                vq->vq.callback(&vq->vq);

        return IRQ_HANDLED;
}
EXPORT_SYMBOL_GPL(vring_interrupt);

struct virtqueue *__vring_new_virtqueue(unsigned int index,
                                        struct vring vring,
                                        struct virtio_device *vdev,
                                        bool weak_barriers,
                                        bool (*notify)(struct virtqueue *),
                                        void (*callback)(struct virtqueue *),
                                        const char *name)
{
        unsigned int i;
        struct vring_virtqueue *vq;

        vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
                     GFP_KERNEL);
        if (!vq)
                return NULL;

        vq->vring = vring;
        vq->vq.callback = callback;
        vq->vq.vdev = vdev;
        vq->vq.name = name;
        vq->vq.num_free = vring.num;
        vq->vq.index = index;
        vq->we_own_ring = false;
        vq->queue_dma_addr = 0;
        vq->queue_size_in_bytes = 0;
        vq->notify = notify;
        vq->weak_barriers = weak_barriers;
        vq->broken = false;
        vq->last_used_idx = 0;
        vq->avail_flags_shadow = 0;
        vq->avail_idx_shadow = 0;
        vq->num_added = 0;
        list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
        vq->in_use = false;
        vq->last_add_time_valid = false;
#endif

        vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
        vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);

        /* No callback?  Tell other side not to bother us. */
        if (!callback) {
                vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
                vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
        }

        /* Put everything in free lists. */
        vq->free_head = 0;
        for (i = 0; i < vring.num-1; i++)
                vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
        memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));

        return &vq->vq;
}
EXPORT_SYMBOL_GPL(__vring_new_virtqueue);

static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
                              dma_addr_t *dma_handle, gfp_t flag)
{
        if (vring_use_dma_api(vdev)) {
                return dma_alloc_coherent(vdev->dev.parent, size,
                                          dma_handle, flag);
        } else {
                void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
                if (queue) {
                        phys_addr_t phys_addr = virt_to_phys(queue);
                        *dma_handle = (dma_addr_t)phys_addr;

                        /*
                         * Sanity check: make sure we didn't truncate
                         * the address.  The only arches I can find that
                         * have 64-bit phys_addr_t but 32-bit dma_addr_t
                         * are certain non-highmem MIPS and x86
                         * configurations, but these configurations
                         * should never allocate physical pages above 32
                         * bits, so this is fine.  Just in case, throw a
                         * warning and abort if we end up with an
                         * unrepresentable address.
                         */
                        if (WARN_ON_ONCE(*dma_handle != phys_addr)) {
                                free_pages_exact(queue, PAGE_ALIGN(size));
                                return NULL;
                        }
                }
                return queue;
        }
}

static void vring_free_queue(struct virtio_device *vdev, size_t size,
                             void *queue, dma_addr_t dma_handle)
{
        if (vring_use_dma_api(vdev)) {
                dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
        } else {
                free_pages_exact(queue, PAGE_ALIGN(size));
        }
}

struct virtqueue *vring_create_virtqueue(
        unsigned int index,
        unsigned int num,
        unsigned int vring_align,
        struct virtio_device *vdev,
        bool weak_barriers,
        bool may_reduce_num,
        bool (*notify)(struct virtqueue *),
        void (*callback)(struct virtqueue *),
        const char *name)
{
        struct virtqueue *vq;
        void *queue = NULL;
        dma_addr_t dma_addr;
        size_t queue_size_in_bytes;
        struct vring vring;

        /* We assume num is a power of 2. */
        if (num & (num - 1)) {
                dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
                return NULL;
        }

        /* TODO: allocate each queue chunk individually */
        for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
                queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
                                          &dma_addr,
                                          GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
                if (queue)
                        break;
        }

        if (!num)
                return NULL;

        if (!queue) {
                /* Try to get a single page. You are my only hope! */
                queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
                                          &dma_addr, GFP_KERNEL|__GFP_ZERO);
        }
        if (!queue)
                return NULL;

        queue_size_in_bytes = vring_size(num, vring_align);
        vring_init(&vring, num, queue, vring_align);

        vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers,
                                   notify, callback, name);
        if (!vq) {
                vring_free_queue(vdev, queue_size_in_bytes, queue,
                                 dma_addr);
                return NULL;
        }

        to_vvq(vq)->queue_dma_addr = dma_addr;
        to_vvq(vq)->queue_size_in_bytes = queue_size_in_bytes;
        to_vvq(vq)->we_own_ring = true;

        return vq;
}
EXPORT_SYMBOL_GPL(vring_create_virtqueue);
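
/*
 * Illustrative sketch only (not built, #if 0 as above): how a hypothetical
 * transport might allocate a ring with vring_create_virtqueue().  The
 * example_notify() doorbell, the 128-entry size and PAGE_SIZE alignment are
 * assumptions for the sketch, not requirements of this API.
 */
#if 0
static bool example_notify(struct virtqueue *vq)
{
        /* Ring the device's doorbell for vq->index here. */
        return true;
}

static struct virtqueue *example_setup_vq(struct virtio_device *vdev,
                                          void (*callback)(struct virtqueue *))
{
        /* 128 entries, page-aligned, weak barriers, allow shrinking on OOM. */
        return vring_create_virtqueue(0, 128, PAGE_SIZE, vdev,
                                      true, true, example_notify, callback,
                                      "example");
}
#endif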

struct virtqueue *vring_new_virtqueue(unsigned int index,
                                      unsigned int num,
                                      unsigned int vring_align,
                                      struct virtio_device *vdev,
                                      bool weak_barriers,
                                      void *pages,
                                      bool (*notify)(struct virtqueue *vq),
                                      void (*callback)(struct virtqueue *vq),
                                      const char *name)
{
        struct vring vring;
        vring_init(&vring, num, pages, vring_align);
        return __vring_new_virtqueue(index, vring, vdev, weak_barriers,
                                     notify, callback, name);
}
EXPORT_SYMBOL_GPL(vring_new_virtqueue);

void vring_del_virtqueue(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->we_own_ring) {
                vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
                                 vq->vring.desc, vq->queue_dma_addr);
        }
        list_del(&_vq->list);
        kfree(vq);
}
EXPORT_SYMBOL_GPL(vring_del_virtqueue);

/* Manipulates transport-specific feature bits. */
void vring_transport_features(struct virtio_device *vdev)
{
        unsigned int i;

        for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
                switch (i) {
                case VIRTIO_RING_F_INDIRECT_DESC:
                        break;
                case VIRTIO_RING_F_EVENT_IDX:
                        break;
                case VIRTIO_F_VERSION_1:
                        break;
                default:
                        /* We don't understand this bit. */
                        __virtio_clear_bit(vdev, i);
                }
        }
}
EXPORT_SYMBOL_GPL(vring_transport_features);

/**
 * virtqueue_get_vring_size - return the size of the virtqueue's vring
 * @vq: the struct virtqueue containing the vring of interest.
 *
 * Returns the size of the vring.  This is mainly used for boasting to
 * userspace.  Unlike other operations, this need not be serialized.
 */
unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        return vq->vring.num;
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);

bool virtqueue_is_broken(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        return vq->broken;
}
EXPORT_SYMBOL_GPL(virtqueue_is_broken);

/*
 * This should prevent the device from being used, allowing drivers to
 * recover.  You may need to grab appropriate locks to flush.
 */
void virtio_break_device(struct virtio_device *dev)
{
        struct virtqueue *_vq;

        list_for_each_entry(_vq, &dev->vqs, list) {
                struct vring_virtqueue *vq = to_vvq(_vq);
                vq->broken = true;
        }
}
EXPORT_SYMBOL_GPL(virtio_break_device);

dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        BUG_ON(!vq->we_own_ring);

        return vq->queue_dma_addr;
}
EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);

dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        BUG_ON(!vq->we_own_ring);

        return vq->queue_dma_addr +
                ((char *)vq->vring.avail - (char *)vq->vring.desc);
}
EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);

dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        BUG_ON(!vq->we_own_ring);

        return vq->queue_dma_addr +
                ((char *)vq->vring.used - (char *)vq->vring.desc);
}
EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
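
/*
 * Illustrative sketch only (not built, #if 0 as above): a transport that owns
 * the ring (created via vring_create_virtqueue) can fetch the DMA addresses
 * of the three vring areas and program them into its device registers; the
 * pr_debug() stands in for hypothetical register writes.
 */
#if 0
static void example_program_ring(struct virtqueue *vq)
{
        dma_addr_t desc = virtqueue_get_desc_addr(vq);
        dma_addr_t avail = virtqueue_get_avail_addr(vq);
        dma_addr_t used = virtqueue_get_used_addr(vq);

        pr_debug("desc %llx avail %llx used %llx\n",
                 (unsigned long long)desc, (unsigned long long)avail,
                 (unsigned long long)used);
}
#endif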

const struct vring *virtqueue_get_vring(struct virtqueue *vq)
{
        return &to_vvq(vq)->vring;
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring);

MODULE_LICENSE("GPL");