linux/drivers/virtio/virtio_ring.c
/* Virtio ring implementation.
 *
 *  Copyright 2007 Rusty Russell IBM Corporation
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_config.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/kmemleak.h>
#include <linux/dma-mapping.h>

#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...)                             \
        do {                                                    \
                dev_err(&(_vq)->vq.vdev->dev,                   \
                        "%s:"fmt, (_vq)->vq.name, ##args);      \
                BUG();                                          \
        } while (0)
/* Caller is supposed to guarantee no reentry. */
#define START_USE(_vq)                                          \
        do {                                                    \
                if ((_vq)->in_use)                              \
                        panic("%s:in_use = %i\n",               \
                              (_vq)->vq.name, (_vq)->in_use);   \
                (_vq)->in_use = __LINE__;                       \
        } while (0)
#define END_USE(_vq) \
        do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0)
#else
#define BAD_RING(_vq, fmt, args...)                             \
        do {                                                    \
                dev_err(&_vq->vq.vdev->dev,                     \
                        "%s:"fmt, (_vq)->vq.name, ##args);      \
                (_vq)->broken = true;                           \
        } while (0)
#define START_USE(vq)
#define END_USE(vq)
#endif

struct vring_desc_state {
        void *data;                     /* Data for callback. */
        struct vring_desc *indir_desc;  /* Indirect descriptor, if any. */
};

struct vring_virtqueue {
        struct virtqueue vq;

        /* Actual memory layout for this queue */
        struct vring vring;

        /* Can we use weak barriers? */
        bool weak_barriers;

        /* Other side has made a mess, don't try any more. */
        bool broken;

        /* Host supports indirect buffers */
        bool indirect;

        /* Host publishes avail event idx */
        bool event;

        /* Head of free buffer list. */
        unsigned int free_head;
        /* Number we've added since last sync. */
        unsigned int num_added;

        /* Last used index we've seen. */
        u16 last_used_idx;

        /* How to notify other side. FIXME: commonalize hcalls! */
        bool (*notify)(struct virtqueue *vq);

        /* DMA, allocation, and size information */
        bool we_own_ring;
        size_t queue_size_in_bytes;
        dma_addr_t queue_dma_addr;

#ifdef DEBUG
        /* They're supposed to lock for us. */
        unsigned int in_use;

        /* Figure out if their kicks are too delayed. */
        bool last_add_time_valid;
        ktime_t last_add_time;
#endif

        /* Per-descriptor state. */
        struct vring_desc_state desc_state[];
};

#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)

/*
 * Modern virtio devices have feature bits to specify whether they need a
 * quirk and bypass the IOMMU. If not there, just use the DMA API.
 *
 * If there, the interaction between virtio and DMA API is messy.
 *
 * On most systems with virtio, physical addresses match bus addresses,
 * and it doesn't particularly matter whether we use the DMA API.
 *
 * On some systems, including Xen and any system with a physical device
 * that speaks virtio behind a physical IOMMU, we must use the DMA API
 * for virtio DMA to work at all.
 *
 * On other systems, including SPARC and PPC64, virtio-pci devices are
 * enumerated as though they are behind an IOMMU, but the virtio host
 * ignores the IOMMU, so we must either pretend that the IOMMU isn't
 * there or somehow map everything as the identity.
 *
 * For the time being, we preserve historic behavior and bypass the DMA
 * API.
 *
 * TODO: install a per-device DMA ops structure that does the right thing
 * taking into account all the above quirks, and use the DMA API
 * unconditionally on data path.
 */

static bool vring_use_dma_api(struct virtio_device *vdev)
{
        if (!virtio_has_iommu_quirk(vdev))
                return true;

        return false;
}

size_t virtio_max_dma_size(struct virtio_device *vdev)
{
        size_t max_segment_size = SIZE_MAX;

        if (vring_use_dma_api(vdev))
                max_segment_size = dma_max_mapping_size(vdev->dev.parent);

        return max_segment_size;
}
EXPORT_SYMBOL_GPL(virtio_max_dma_size);

/*
 * The DMA ops on various arches are rather gnarly right now, and
 * making all of the arch DMA ops work on the vring device itself
 * is a mess.  For now, we use the parent device for DMA ops.
 */
static struct device *vring_dma_dev(const struct vring_virtqueue *vq)
{
        return vq->vq.vdev->dev.parent;
}

/* Map one sg entry. */
static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
                                   struct scatterlist *sg,
                                   enum dma_data_direction direction)
{
        if (!vring_use_dma_api(vq->vq.vdev))
                return (dma_addr_t)sg_phys(sg);

        /*
         * We can't use dma_map_sg, because we don't use scatterlists in
         * the way it expects (we don't guarantee that the scatterlist
         * will exist for the lifetime of the mapping).
         */
        return dma_map_page(vring_dma_dev(vq),
                            sg_page(sg), sg->offset, sg->length,
                            direction);
}

static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
                                   void *cpu_addr, size_t size,
                                   enum dma_data_direction direction)
{
        if (!vring_use_dma_api(vq->vq.vdev))
                return (dma_addr_t)virt_to_phys(cpu_addr);

        return dma_map_single(vring_dma_dev(vq),
                              cpu_addr, size, direction);
}

static void vring_unmap_one(const struct vring_virtqueue *vq,
                            struct vring_desc *desc)
{
        u16 flags;

        if (!vring_use_dma_api(vq->vq.vdev))
                return;

        flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);

        if (flags & VRING_DESC_F_INDIRECT) {
                dma_unmap_single(vring_dma_dev(vq),
                                 virtio64_to_cpu(vq->vq.vdev, desc->addr),
                                 virtio32_to_cpu(vq->vq.vdev, desc->len),
                                 (flags & VRING_DESC_F_WRITE) ?
                                 DMA_FROM_DEVICE : DMA_TO_DEVICE);
        } else {
                dma_unmap_page(vring_dma_dev(vq),
                               virtio64_to_cpu(vq->vq.vdev, desc->addr),
                               virtio32_to_cpu(vq->vq.vdev, desc->len),
                               (flags & VRING_DESC_F_WRITE) ?
                               DMA_FROM_DEVICE : DMA_TO_DEVICE);
        }
}

static int vring_mapping_error(const struct vring_virtqueue *vq,
                               dma_addr_t addr)
{
        if (!vring_use_dma_api(vq->vq.vdev))
                return 0;

        return dma_mapping_error(vring_dma_dev(vq), addr);
}

static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
                                         unsigned int total_sg, gfp_t gfp)
{
        struct vring_desc *desc;
        unsigned int i;

        /*
         * We require lowmem mappings for the descriptors because
         * otherwise virt_to_phys will give us bogus addresses in the
         * virtqueue.
         */
        gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH);

        desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
        if (!desc)
                return NULL;

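        /*
         * Chain all entries together; the last entry's "next" points past
         * the table, which is harmless because the caller clears
         * VRING_DESC_F_NEXT on the final descriptor it actually uses.
         */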
        for (i = 0; i < total_sg; i++)
                desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
        return desc;
}

static inline int virtqueue_add(struct virtqueue *_vq,
                                struct scatterlist *sgs[],
                                unsigned int total_sg,
                                unsigned int out_sgs,
                                unsigned int in_sgs,
                                void *data,
                                gfp_t gfp)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct scatterlist *sg;
        struct vring_desc *desc;
        unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
        int head;
        bool indirect;

        START_USE(vq);

        BUG_ON(data == NULL);

        if (unlikely(vq->broken)) {
                END_USE(vq);
                return -EIO;
        }

#ifdef DEBUG
        {
                ktime_t now = ktime_get();

                /* No kick or get, with .1 second between?  Warn. */
                if (vq->last_add_time_valid)
                        WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
                                            > 100);
                vq->last_add_time = now;
                vq->last_add_time_valid = true;
        }
#endif

        BUG_ON(total_sg > vq->vring.num);
        BUG_ON(total_sg == 0);

        head = vq->free_head;

        /* If the host supports indirect descriptor tables, and we have multiple
         * buffers, then go indirect. FIXME: tune this threshold */
        if (vq->indirect && total_sg > 1 && vq->vq.num_free)
                desc = alloc_indirect(_vq, total_sg, gfp);
        else
                desc = NULL;

        if (desc) {
                /* Use a single buffer which doesn't continue */
                indirect = true;
                /* Set up rest to use this indirect table. */
                i = 0;
                descs_used = 1;
        } else {
                indirect = false;
                desc = vq->vring.desc;
                i = head;
                descs_used = total_sg;
        }

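        /*
         * With an indirect table only a single descriptor in the main ring
         * is consumed, no matter how many scatterlist entries the request
         * has; otherwise every entry needs its own ring descriptor.
         */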
        if (vq->vq.num_free < descs_used) {
                pr_debug("Can't add buf len %i - avail = %i\n",
                         descs_used, vq->vq.num_free);
                /* FIXME: for historical reasons, we force a notify here if
                 * there are outgoing parts to the buffer.  Presumably the
                 * host should service the ring ASAP. */
                if (out_sgs)
                        vq->notify(&vq->vq);
                END_USE(vq);
                return -ENOSPC;
        }

        for (n = 0; n < out_sgs; n++) {
                for (sg = sgs[n]; sg; sg = sg_next(sg)) {
                        dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
                        if (vring_mapping_error(vq, addr))
                                goto unmap_release;

                        desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
                        desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
                        desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
                        prev = i;
                        i = virtio16_to_cpu(_vq->vdev, desc[i].next);
                }
        }
        for (; n < (out_sgs + in_sgs); n++) {
                for (sg = sgs[n]; sg; sg = sg_next(sg)) {
                        dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
                        if (vring_mapping_error(vq, addr))
                                goto unmap_release;

                        desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
                        desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
                        desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
                        prev = i;
                        i = virtio16_to_cpu(_vq->vdev, desc[i].next);
                }
        }
        /* Last one doesn't continue. */
        desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);

        if (indirect) {
                /* Now that the indirect table is filled in, map it. */
                dma_addr_t addr = vring_map_single(
                        vq, desc, total_sg * sizeof(struct vring_desc),
                        DMA_TO_DEVICE);
                if (vring_mapping_error(vq, addr))
                        goto unmap_release;

                vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
                vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);

                vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
        }

        /* We're using some buffers from the free list. */
        vq->vq.num_free -= descs_used;

        /* Update free pointer */
        if (indirect)
                vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next);
        else
                vq->free_head = i;

        /* Store token and indirect buffer state. */
        vq->desc_state[head].data = data;
        if (indirect)
                vq->desc_state[head].indir_desc = desc;

        /* Put entry in available array (but don't update avail->idx until they
         * do sync). */
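        /* The ring size is always a power of two, so masking the free-running
         * index with (num - 1) wraps it onto a valid slot. */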
        avail = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) & (vq->vring.num - 1);
        vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

        /* Descriptors and available array need to be set before we expose the
         * new available array entries. */
        virtio_wmb(vq->weak_barriers);
        vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) + 1);
        vq->num_added++;

        pr_debug("Added buffer head %i to %p\n", head, vq);
        END_USE(vq);

        /* This is very unlikely, but theoretically possible.  Kick
         * just in case. */
        if (unlikely(vq->num_added == (1 << 16) - 1))
                virtqueue_kick(_vq);

        return 0;

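        /*
         * A DMA mapping failed part-way through: walk the chain we have
         * built so far, unmapping every descriptor up to the one that
         * failed, and free the indirect table if one was allocated.  The
         * free list itself has not been touched yet, so nothing else needs
         * to be rolled back.
         */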
unmap_release:
        err_idx = i;
        i = head;

        for (n = 0; n < total_sg; n++) {
                if (i == err_idx)
                        break;
                vring_unmap_one(vq, &desc[i]);
                i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
        }

        if (indirect)
                kfree(desc);

        END_USE(vq);
        return -EIO;
}

/**
 * virtqueue_add_sgs - expose buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sgs: array of terminated scatterlists.
 * @out_sgs: the number of scatterlists readable by other side
 * @in_sgs: the number of scatterlists which are writable (after readable ones)
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_sgs(struct virtqueue *_vq,
                      struct scatterlist *sgs[],
                      unsigned int out_sgs,
                      unsigned int in_sgs,
                      void *data,
                      gfp_t gfp)
{
        unsigned int i, total_sg = 0;

        /* Count them first. */
        for (i = 0; i < out_sgs + in_sgs; i++) {
                struct scatterlist *sg;
                for (sg = sgs[i]; sg; sg = sg_next(sg))
                        total_sg++;
        }
        return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_sgs);

/**
 * virtqueue_add_outbuf - expose output buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (must be well-formed and terminated!)
 * @num: the number of entries in @sg readable by other side
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_outbuf(struct virtqueue *vq,
                         struct scatterlist *sg, unsigned int num,
                         void *data,
                         gfp_t gfp)
{
        return virtqueue_add(vq, &sg, num, 1, 0, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);

/**
 * virtqueue_add_inbuf - expose input buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (must be well-formed and terminated!)
 * @num: the number of entries in @sg writable by other side
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_inbuf(struct virtqueue *vq,
                        struct scatterlist *sg, unsigned int num,
                        void *data,
                        gfp_t gfp)
{
        return virtqueue_add(vq, &sg, num, 0, 1, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);

/**
 * virtqueue_kick_prepare - first half of split virtqueue_kick call.
 * @vq: the struct virtqueue
 *
 * Instead of virtqueue_kick(), you can do:
 *      if (virtqueue_kick_prepare(vq))
 *              virtqueue_notify(vq);
 *
 * This is sometimes useful because the virtqueue_kick_prepare() needs
 * to be serialized, but the actual virtqueue_notify() call does not.
 */
bool virtqueue_kick_prepare(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 new, old;
        bool needs_kick;

        START_USE(vq);
        /* We need to expose available array entries before checking avail
         * event. */
        virtio_mb(vq->weak_barriers);

        old = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - vq->num_added;
        new = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx);
        vq->num_added = 0;

#ifdef DEBUG
        if (vq->last_add_time_valid) {
                WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
                                              vq->last_add_time)) > 100);
        }
        vq->last_add_time_valid = false;
#endif

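        /*
         * With event index support, only kick if the index the device asked
         * to be notified at lies within the window of entries added since
         * the last kick (old .. new - 1); without it, honour the
         * VRING_USED_F_NO_NOTIFY flag.
         */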
        if (vq->event) {
                needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
                                              new, old);
        } else {
                needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
        }
        END_USE(vq);
        return needs_kick;
}
EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);

/**
 * virtqueue_notify - second half of split virtqueue_kick call.
 * @vq: the struct virtqueue
 *
 * This does not need to be serialized.
 *
 * Returns false if host notify failed or queue is broken, otherwise true.
 */
bool virtqueue_notify(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (unlikely(vq->broken))
                return false;

        /* Prod other side to tell it about changes. */
        if (!vq->notify(_vq)) {
                vq->broken = true;
                return false;
        }
        return true;
}
EXPORT_SYMBOL_GPL(virtqueue_notify);

/**
 * virtqueue_kick - update after add_buf
 * @vq: the struct virtqueue
 *
 * After one or more virtqueue_add_* calls, invoke this to kick
 * the other side.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 *
 * Returns false if kick failed, otherwise true.
 */
bool virtqueue_kick(struct virtqueue *vq)
{
        if (virtqueue_kick_prepare(vq))
                return virtqueue_notify(vq);
        return true;
}
EXPORT_SYMBOL_GPL(virtqueue_kick);

static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
{
        unsigned int i, j;
        u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);

        /* Clear data ptr. */
        vq->desc_state[head].data = NULL;

        /* Put back on free list: unmap first-level descriptors and find end */
        i = head;

        while (vq->vring.desc[i].flags & nextflag) {
                vring_unmap_one(vq, &vq->vring.desc[i]);
                i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
                vq->vq.num_free++;
        }

        vring_unmap_one(vq, &vq->vring.desc[i]);
        vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
        vq->free_head = head;

        /* Plus final descriptor */
        vq->vq.num_free++;

        /* Free the indirect table, if any, now that it's unmapped. */
        if (vq->desc_state[head].indir_desc) {
                struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
                u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);

                BUG_ON(!(vq->vring.desc[head].flags &
                         cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
                BUG_ON(len == 0 || len % sizeof(struct vring_desc));

                for (j = 0; j < len / sizeof(struct vring_desc); j++)
                        vring_unmap_one(vq, &indir_desc[j]);

                kfree(vq->desc_state[head].indir_desc);
                vq->desc_state[head].indir_desc = NULL;
        }
}

static inline bool more_used(const struct vring_virtqueue *vq)
{
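        /*
         * Both indices are free-running 16-bit counters and are only masked
         * when they are used to index the ring, so a simple inequality test
         * works across wrap-around.
         */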
        return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
}

/**
 * virtqueue_get_buf - get the next used buffer
 * @vq: the struct virtqueue we're talking about.
 * @len: the length written into the buffer
 *
 * If the device wrote data into the buffer, @len will be set to the
 * amount written.  This means you don't need to clear the buffer
 * beforehand to ensure there's no data leakage in the case of short
 * writes.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 *
 * Returns NULL if there are no used buffers, or the "data" token
 * handed to virtqueue_add_*().
 */
void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        void *ret;
        unsigned int i;
        u16 last_used;

        START_USE(vq);

        if (unlikely(vq->broken)) {
                END_USE(vq);
                return NULL;
        }

        if (!more_used(vq)) {
                pr_debug("No more buffers in queue\n");
                END_USE(vq);
                return NULL;
        }

        /* Only get used array entries after they have been exposed by host. */
        virtio_rmb(vq->weak_barriers);

        last_used = (vq->last_used_idx & (vq->vring.num - 1));
        i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
        *len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);

        if (unlikely(i >= vq->vring.num)) {
                BAD_RING(vq, "id %u out of range\n", i);
                return NULL;
        }
        if (unlikely(!vq->desc_state[i].data)) {
                BAD_RING(vq, "id %u is not a head!\n", i);
                return NULL;
        }

        /* detach_buf clears data, so grab it now. */
        ret = vq->desc_state[i].data;
        detach_buf(vq, i);
        vq->last_used_idx++;
        /* If we expect an interrupt for the next entry, tell host
         * by writing event index and flush out the write before
         * the read in the next get_buf call. */
        if (!(vq->vring.avail->flags & cpu_to_virtio16(_vq->vdev, VRING_AVAIL_F_NO_INTERRUPT))) {
                vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, vq->last_used_idx);
                virtio_mb(vq->weak_barriers);
        }

#ifdef DEBUG
        vq->last_add_time_valid = false;
#endif

        END_USE(vq);
        return ret;
}
EXPORT_SYMBOL_GPL(virtqueue_get_buf);

/**
 * virtqueue_disable_cb - disable callbacks
 * @vq: the struct virtqueue we're talking about.
 *
 * Note that this is not necessarily synchronous, hence unreliable and only
 * useful as an optimization.
 *
 * Unlike other operations, this need not be serialized.
 */
void virtqueue_disable_cb(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        vq->vring.avail->flags |= cpu_to_virtio16(_vq->vdev, VRING_AVAIL_F_NO_INTERRUPT);
}
EXPORT_SYMBOL_GPL(virtqueue_disable_cb);

/**
 * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
 * @vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks; it returns current queue state
 * in an opaque unsigned value. This value should be later tested by
 * virtqueue_poll, to detect a possible race between the driver checking for
 * more work, and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 last_used_idx;

        START_USE(vq);

        /* We optimistically turn back on interrupts, then check if there was
         * more to do. */
        /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
         * either clear the flags bit or point the event index at the next
         * entry. Always do both to keep code simple. */
        vq->vring.avail->flags &= cpu_to_virtio16(_vq->vdev, ~VRING_AVAIL_F_NO_INTERRUPT);
        vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
        END_USE(vq);
        return last_used_idx;
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);

/**
 * virtqueue_poll - query pending used buffers
 * @vq: the struct virtqueue we're talking about.
 * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare).
 *
 * Returns "true" if there are pending used buffers in the queue.
 *
 * This does not need to be serialized.
 */
bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

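        /* Make sure the event index written by virtqueue_enable_cb_prepare()
         * is exposed before we re-read the used index, so a buffer used in
         * between cannot be missed. */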
        virtio_mb(vq->weak_barriers);
        return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
}
EXPORT_SYMBOL_GPL(virtqueue_poll);

/**
 * virtqueue_enable_cb - restart callbacks after disable_cb.
 * @vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks; it returns "false" if there are pending
 * buffers in the queue, to detect a possible race between the driver
 * checking for more work, and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
bool virtqueue_enable_cb(struct virtqueue *_vq)
{
        unsigned last_used_idx = virtqueue_enable_cb_prepare(_vq);
        return !virtqueue_poll(_vq, last_used_idx);
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb);

/**
 * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
 * @vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks but hints to the other side to delay
 * interrupts until most of the available buffers have been processed;
 * it returns "false" if there are many pending buffers in the queue,
 * to detect a possible race between the driver checking for more work,
 * and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 bufs;

        START_USE(vq);

        /* We optimistically turn back on interrupts, then check if there was
         * more to do. */
        /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
         * either clear the flags bit or point the event index at the next
         * entry. Always do both to keep code simple. */
        vq->vring.avail->flags &= cpu_to_virtio16(_vq->vdev, ~VRING_AVAIL_F_NO_INTERRUPT);
        /* TODO: tune this threshold */
        bufs = (u16)(virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - vq->last_used_idx) * 3 / 4;
        vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs);
        virtio_mb(vq->weak_barriers);
        if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
                END_USE(vq);
                return false;
        }

        END_USE(vq);
        return true;
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);

/**
 * virtqueue_detach_unused_buf - detach first unused buffer
 * @vq: the struct virtqueue we're talking about.
 *
 * Returns NULL or the "data" token handed to virtqueue_add_*().
 * This is not valid on an active queue; it is useful only for device
 * shutdown.
 */
void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        unsigned int i;
        void *buf;

        START_USE(vq);

        for (i = 0; i < vq->vring.num; i++) {
                if (!vq->desc_state[i].data)
                        continue;
                /* detach_buf clears data, so grab it now. */
                buf = vq->desc_state[i].data;
                detach_buf(vq, i);
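                /* The device never consumed this buffer, so also roll back
                 * the published avail index that referenced it. */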
                vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - 1);
                END_USE(vq);
                return buf;
        }
        /* That should have freed everything. */
        BUG_ON(vq->vq.num_free != vq->vring.num);

        END_USE(vq);
        return NULL;
}
EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);

irqreturn_t vring_interrupt(int irq, void *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (!more_used(vq)) {
                pr_debug("virtqueue interrupt with no work for %p\n", vq);
                return IRQ_NONE;
        }

        if (unlikely(vq->broken))
                return IRQ_HANDLED;

        pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
        if (vq->vq.callback)
                vq->vq.callback(&vq->vq);

        return IRQ_HANDLED;
}
EXPORT_SYMBOL_GPL(vring_interrupt);

struct virtqueue *__vring_new_virtqueue(unsigned int index,
                                        struct vring vring,
                                        struct virtio_device *vdev,
                                        bool weak_barriers,
                                        bool (*notify)(struct virtqueue *),
                                        void (*callback)(struct virtqueue *),
                                        const char *name)
{
        unsigned int i;
        struct vring_virtqueue *vq;

        vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
                     GFP_KERNEL);
        if (!vq)
                return NULL;

        vq->vring = vring;
        vq->vq.callback = callback;
        vq->vq.vdev = vdev;
        vq->vq.name = name;
        vq->vq.num_free = vring.num;
        vq->vq.index = index;
        vq->we_own_ring = false;
        vq->queue_dma_addr = 0;
        vq->queue_size_in_bytes = 0;
        vq->notify = notify;
        vq->weak_barriers = weak_barriers;
        vq->broken = false;
        vq->last_used_idx = 0;
        vq->num_added = 0;
        list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
        vq->in_use = false;
        vq->last_add_time_valid = false;
#endif

        vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
        vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);

        /* No callback?  Tell other side not to bother us. */
        if (!callback)
                vq->vring.avail->flags |= cpu_to_virtio16(vdev, VRING_AVAIL_F_NO_INTERRUPT);

        /* Put everything in free lists. */
        vq->free_head = 0;
        for (i = 0; i < vring.num-1; i++)
                vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
        memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));

        return &vq->vq;
}
EXPORT_SYMBOL_GPL(__vring_new_virtqueue);

static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
                              dma_addr_t *dma_handle, gfp_t flag)
{
        if (vring_use_dma_api(vdev)) {
                return dma_alloc_coherent(vdev->dev.parent, size,
                                          dma_handle, flag);
        } else {
                void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
                if (queue) {
                        phys_addr_t phys_addr = virt_to_phys(queue);
                        *dma_handle = (dma_addr_t)phys_addr;

                        /*
                         * Sanity check: make sure we didn't truncate
                         * the address.  The only arches I can find that
                         * have 64-bit phys_addr_t but 32-bit dma_addr_t
                         * are certain non-highmem MIPS and x86
                         * configurations, but these configurations
                         * should never allocate physical pages above 32
                         * bits, so this is fine.  Just in case, throw a
                         * warning and abort if we end up with an
                         * unrepresentable address.
                         */
                        if (WARN_ON_ONCE(*dma_handle != phys_addr)) {
                                free_pages_exact(queue, PAGE_ALIGN(size));
                                return NULL;
                        }
                }
                return queue;
        }
}

static void vring_free_queue(struct virtio_device *vdev, size_t size,
                             void *queue, dma_addr_t dma_handle)
{
        if (vring_use_dma_api(vdev)) {
                dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
        } else {
                free_pages_exact(queue, PAGE_ALIGN(size));
        }
}

struct virtqueue *vring_create_virtqueue(
        unsigned int index,
        unsigned int num,
        unsigned int vring_align,
        struct virtio_device *vdev,
        bool weak_barriers,
        bool may_reduce_num,
        bool (*notify)(struct virtqueue *),
        void (*callback)(struct virtqueue *),
        const char *name)
{
        struct virtqueue *vq;
        void *queue = NULL;
        dma_addr_t dma_addr;
        size_t queue_size_in_bytes;
        struct vring vring;

        /* We assume num is a power of 2. */
        if (num & (num - 1)) {
                dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
                return NULL;
        }

        /* TODO: allocate each queue chunk individually */
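        /* Start with the requested size; while the ring is larger than a
         * page and the caller allows it, halve the size each time a
         * contiguous allocation fails. */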
        for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
                queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
                                          &dma_addr,
                                          GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
                if (queue)
                        break;
                if (!may_reduce_num)
                        return NULL;
        }

        if (!num)
                return NULL;

        if (!queue) {
                /* Try to get a single page. You are my only hope! */
                queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
                                          &dma_addr, GFP_KERNEL|__GFP_ZERO);
        }
        if (!queue)
                return NULL;

        queue_size_in_bytes = vring_size(num, vring_align);
        vring_init(&vring, num, queue, vring_align);

        vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers,
                                   notify, callback, name);
        if (!vq) {
                vring_free_queue(vdev, queue_size_in_bytes, queue,
                                 dma_addr);
                return NULL;
        }

        to_vvq(vq)->queue_dma_addr = dma_addr;
        to_vvq(vq)->queue_size_in_bytes = queue_size_in_bytes;
        to_vvq(vq)->we_own_ring = true;

        return vq;
}
EXPORT_SYMBOL_GPL(vring_create_virtqueue);

struct virtqueue *vring_new_virtqueue(unsigned int index,
                                      unsigned int num,
                                      unsigned int vring_align,
                                      struct virtio_device *vdev,
                                      bool weak_barriers,
                                      void *pages,
                                      bool (*notify)(struct virtqueue *vq),
                                      void (*callback)(struct virtqueue *vq),
                                      const char *name)
{
        struct vring vring;
        vring_init(&vring, num, pages, vring_align);
        return __vring_new_virtqueue(index, vring, vdev, weak_barriers,
                                     notify, callback, name);
}
EXPORT_SYMBOL_GPL(vring_new_virtqueue);

void vring_del_virtqueue(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->we_own_ring) {
                vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
                                 vq->vring.desc, vq->queue_dma_addr);
        }
        list_del(&_vq->list);
        kfree(vq);
}
EXPORT_SYMBOL_GPL(vring_del_virtqueue);

/* Manipulates transport-specific feature bits. */
void vring_transport_features(struct virtio_device *vdev)
{
        unsigned int i;

        for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
                switch (i) {
                case VIRTIO_RING_F_INDIRECT_DESC:
                        break;
                case VIRTIO_RING_F_EVENT_IDX:
                        break;
                case VIRTIO_F_VERSION_1:
                        break;
                case VIRTIO_F_IOMMU_PLATFORM:
                        break;
                default:
                        /* We don't understand this bit. */
                        __virtio_clear_bit(vdev, i);
                }
        }
}
EXPORT_SYMBOL_GPL(vring_transport_features);

/**
 * virtqueue_get_vring_size - return the size of the virtqueue's vring
 * @vq: the struct virtqueue containing the vring of interest.
 *
 * Returns the size of the vring.  This is mainly used for boasting to
 * userspace.  Unlike other operations, this need not be serialized.
 */
unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
{

        struct vring_virtqueue *vq = to_vvq(_vq);

        return vq->vring.num;
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);

bool virtqueue_is_broken(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        return vq->broken;
}
EXPORT_SYMBOL_GPL(virtqueue_is_broken);

/*
 * This should prevent the device from being used, allowing drivers to
 * recover.  You may need to grab appropriate locks to flush.
 */
void virtio_break_device(struct virtio_device *dev)
{
        struct virtqueue *_vq;

        list_for_each_entry(_vq, &dev->vqs, list) {
                struct vring_virtqueue *vq = to_vvq(_vq);
                vq->broken = true;
        }
}
EXPORT_SYMBOL_GPL(virtio_break_device);

dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        BUG_ON(!vq->we_own_ring);

        return vq->queue_dma_addr;
}
EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);

dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        BUG_ON(!vq->we_own_ring);

        return vq->queue_dma_addr +
                ((char *)vq->vring.avail - (char *)vq->vring.desc);
}
EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);

dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        BUG_ON(!vq->we_own_ring);

        return vq->queue_dma_addr +
                ((char *)vq->vring.used - (char *)vq->vring.desc);
}
EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);

const struct vring *virtqueue_get_vring(struct virtqueue *vq)
{
        return &to_vvq(vq)->vring;
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring);

MODULE_LICENSE("GPL");