linux/drivers/hv/ring_buffer.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 *   K. Y. Srinivasan <kys@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/hyperv.h>
#include <linux/uio.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/prefetch.h>

#include "hyperv_vmbus.h"

#define VMBUS_PKT_TRAILER       8

/*
 * When we write to the ring buffer, check if the host needs to
 * be signaled. Here are the details of this protocol:
 *
 *      1. The host guarantees that while it is draining the
 *         ring buffer, it will set the interrupt_mask to
 *         indicate it does not need to be interrupted when
 *         new data is placed.
 *
 *      2. The host guarantees that it will completely drain
 *         the ring buffer before exiting the read loop. Further,
 *         once the ring buffer is empty, it will clear the
 *         interrupt_mask and re-check to see if new data has
 *         arrived.
 *
 * KYS: Oct. 30, 2016:
 * It looks like Windows hosts have logic to deal with DOS attacks that
 * can be triggered if they receive interrupts when they are not expecting
 * the interrupt. The host expects interrupts only when the ring
 * transitions from empty to non-empty (or full to non-full on the guest
 * to host ring).
 * So, base the signaling decision solely on the ring state until the
 * host logic is fixed.
 */

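/*
 * The empty-to-non-empty check is done by comparing the write offset
 * captured before the write (old_write) with the host's read_index: if
 * they were equal, the ring was empty when this write began, so the host
 * may have cleared interrupt_mask and must be signaled.
 */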
static void hv_signal_on_write(u32 old_write, struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->outbound;

        virt_mb();
        if (READ_ONCE(rbi->ring_buffer->interrupt_mask))
                return;

        /* check interrupt_mask before read_index */
        virt_rmb();
        /*
         * This is the only case we need to signal when the
         * ring transitions from being empty to non-empty.
         */
        if (old_write == READ_ONCE(rbi->ring_buffer->read_index)) {
                ++channel->intr_out_empty;
                vmbus_setevent(channel);
        }
}

/* Get the next write location for the specified ring buffer. */
static inline u32
hv_get_next_write_location(struct hv_ring_buffer_info *ring_info)
{
        u32 next = ring_info->ring_buffer->write_index;

        return next;
}

/* Set the next write location for the specified ring buffer. */
static inline void
hv_set_next_write_location(struct hv_ring_buffer_info *ring_info,
                     u32 next_write_location)
{
        ring_info->ring_buffer->write_index = next_write_location;
}

/* Set the next read location for the specified ring buffer. */
static inline void
hv_set_next_read_location(struct hv_ring_buffer_info *ring_info,
                    u32 next_read_location)
{
        ring_info->ring_buffer->read_index = next_read_location;
        ring_info->priv_read_index = next_read_location;
}

/* Get the size of the ring buffer. */
static inline u32
hv_get_ring_buffersize(const struct hv_ring_buffer_info *ring_info)
{
        return ring_info->ring_datasize;
}

/*
 * Get the ring buffer indices as a u64 for the packet trailer: only the
 * write index is stored, in the upper 32 bits; the lower 32 bits are
 * left zero.
 */
static inline u64
hv_get_ring_bufferindices(struct hv_ring_buffer_info *ring_info)
{
        return (u64)ring_info->ring_buffer->write_index << 32;
}

/*
 * Helper routine to copy from source to ring buffer.
 * Assumes there is enough room. Handles wrap-around in the destination
 * case only!!
 */
static u32 hv_copyto_ringbuffer(
        struct hv_ring_buffer_info      *ring_info,
        u32                             start_write_offset,
        const void                      *src,
        u32                             srclen)
{
        void *ring_buffer = hv_get_ring_buffer(ring_info);
        u32 ring_buffer_size = hv_get_ring_buffersize(ring_info);

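        /*
         * The data pages are mapped twice back to back (see the wraparound
         * mapping in hv_ringbuffer_init()), so a copy that runs past
         * ring_buffer_size simply continues into the second mapping of the
         * same pages and the memcpy never needs to be split.  Only the
         * returned offset is wrapped back into [0, ring_buffer_size).
         */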
        memcpy(ring_buffer + start_write_offset, src, srclen);

        start_write_offset += srclen;
        if (start_write_offset >= ring_buffer_size)
                start_write_offset -= ring_buffer_size;

        return start_write_offset;
}

/*
 * hv_get_ringbuffer_availbytes()
 *
 * Get the number of bytes available to read from and to write to
 * the specified ring buffer.
 */
static void
hv_get_ringbuffer_availbytes(const struct hv_ring_buffer_info *rbi,
                             u32 *read, u32 *write)
{
        u32 read_loc, write_loc, dsize;

        /* Capture the read/write indices before they change */
        read_loc = READ_ONCE(rbi->ring_buffer->read_index);
        write_loc = READ_ONCE(rbi->ring_buffer->write_index);
        dsize = rbi->ring_datasize;

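        /*
         * For example, with dsize = 0x10000, read_loc = 0x100 and
         * write_loc = 0x300 there are 0x200 bytes available to read and
         * 0x10000 - 0x200 = 0xfe00 bytes of write space.
         */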
        *write = write_loc >= read_loc ? dsize - (write_loc - read_loc) :
                read_loc - write_loc;
        *read = dsize - *write;
}

/* Get various debug metrics for the specified ring buffer. */
int hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
                                struct hv_ring_buffer_debug_info *debug_info)
{
        u32 bytes_avail_towrite;
        u32 bytes_avail_toread;

        mutex_lock(&ring_info->ring_buffer_mutex);

        if (!ring_info->ring_buffer) {
                mutex_unlock(&ring_info->ring_buffer_mutex);
                return -EINVAL;
        }

        hv_get_ringbuffer_availbytes(ring_info,
                                     &bytes_avail_toread,
                                     &bytes_avail_towrite);
        debug_info->bytes_avail_toread = bytes_avail_toread;
        debug_info->bytes_avail_towrite = bytes_avail_towrite;
        debug_info->current_read_index = ring_info->ring_buffer->read_index;
        debug_info->current_write_index = ring_info->ring_buffer->write_index;
        debug_info->current_interrupt_mask
                = ring_info->ring_buffer->interrupt_mask;
        mutex_unlock(&ring_info->ring_buffer_mutex);

        return 0;
}
EXPORT_SYMBOL_GPL(hv_ringbuffer_get_debuginfo);

/* Initialize a channel's ring buffer info mutex locks */
void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
{
        mutex_init(&channel->inbound.ring_buffer_mutex);
        mutex_init(&channel->outbound.ring_buffer_mutex);
}

/* Initialize the ring buffer. */
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
                       struct page *pages, u32 page_cnt)
{
        int i;
        struct page **pages_wraparound;

        BUILD_BUG_ON((sizeof(struct hv_ring_buffer) != PAGE_SIZE));

        /*
         * First page holds struct hv_ring_buffer, do wraparound mapping for
         * the rest.
         */
        pages_wraparound = kcalloc(page_cnt * 2 - 1, sizeof(struct page *),
                                   GFP_KERNEL);
        if (!pages_wraparound)
                return -ENOMEM;

        pages_wraparound[0] = pages;
        for (i = 0; i < 2 * (page_cnt - 1); i++)
                pages_wraparound[i + 1] = &pages[i % (page_cnt - 1) + 1];

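        /*
         * The resulting virtual layout for page_cnt = N is:
         *
         *      [ header ][ data 1 .. N-1 ][ data 1 .. N-1 ]
         *
         * i.e. the N-1 data pages are mapped twice in a row, so any ring
         * offset below ring_datasize can be accessed linearly even when
         * the access wraps around the end of the data area.
         */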
        ring_info->ring_buffer = (struct hv_ring_buffer *)
                vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, PAGE_KERNEL);

        kfree(pages_wraparound);

        if (!ring_info->ring_buffer)
                return -ENOMEM;

        ring_info->ring_buffer->read_index =
                ring_info->ring_buffer->write_index = 0;

        /* Set the feature bit for enabling flow control. */
        ring_info->ring_buffer->feature_bits.value = 1;

        ring_info->ring_size = page_cnt << PAGE_SHIFT;
        ring_info->ring_size_div10_reciprocal =
                reciprocal_value(ring_info->ring_size / 10);
        ring_info->ring_datasize = ring_info->ring_size -
                sizeof(struct hv_ring_buffer);
        ring_info->priv_read_index = 0;

        spin_lock_init(&ring_info->ring_lock);

        return 0;
}

/* Cleanup the ring buffer. */
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
{
        mutex_lock(&ring_info->ring_buffer_mutex);
        vunmap(ring_info->ring_buffer);
        ring_info->ring_buffer = NULL;
        mutex_unlock(&ring_info->ring_buffer_mutex);
}

/* Write to the ring buffer. */
int hv_ringbuffer_write(struct vmbus_channel *channel,
                        const struct kvec *kv_list, u32 kv_count)
{
        int i;
        u32 bytes_avail_towrite;
        u32 totalbytes_towrite = sizeof(u64);
        u32 next_write_location;
        u32 old_write;
        u64 prev_indices;
        unsigned long flags;
        struct hv_ring_buffer_info *outring_info = &channel->outbound;

        if (channel->rescind)
                return -ENODEV;

        for (i = 0; i < kv_count; i++)
                totalbytes_towrite += kv_list[i].iov_len;

        spin_lock_irqsave(&outring_info->ring_lock, flags);

        bytes_avail_towrite = hv_get_bytes_to_write(outring_info);

        /*
         * If there is only room for the packet, assume it is full.
         * Otherwise, the next time around, we think the ring buffer
         * is empty since the read index == write index.
         */
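        /*
         * For example, with 100 bytes free and a 100-byte write (trailer
         * included), write_index would end up equal to read_index and the
         * ring would be indistinguishable from an empty one, so such a
         * write is refused as well.
         */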
        if (bytes_avail_towrite <= totalbytes_towrite) {
                ++channel->out_full_total;

                if (!channel->out_full_flag) {
                        ++channel->out_full_first;
                        channel->out_full_flag = true;
                }

                spin_unlock_irqrestore(&outring_info->ring_lock, flags);
                return -EAGAIN;
        }

        channel->out_full_flag = false;

        /* Write to the ring buffer */
        next_write_location = hv_get_next_write_location(outring_info);

        old_write = next_write_location;

        for (i = 0; i < kv_count; i++) {
                next_write_location = hv_copyto_ringbuffer(outring_info,
                                                     next_write_location,
                                                     kv_list[i].iov_base,
                                                     kv_list[i].iov_len);
        }

        /* Set previous packet start */
        prev_indices = hv_get_ring_bufferindices(outring_info);

        next_write_location = hv_copyto_ringbuffer(outring_info,
                                             next_write_location,
                                             &prev_indices,
                                             sizeof(u64));
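        /*
         * The 8 bytes just written form the per-packet trailer that
         * VMBUS_PKT_TRAILER accounts for when the reader advances past a
         * packet in __hv_pkt_iter_next().
         */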

        /* Issue a full memory barrier before updating the write index */
        virt_mb();

        /* Now, update the write location */
        hv_set_next_write_location(outring_info, next_write_location);

        spin_unlock_irqrestore(&outring_info->ring_lock, flags);

        hv_signal_on_write(old_write, channel);

        if (channel->rescind)
                return -ENODEV;

        return 0;
}

int hv_ringbuffer_read(struct vmbus_channel *channel,
                       void *buffer, u32 buflen, u32 *buffer_actual_len,
                       u64 *requestid, bool raw)
{
        struct vmpacket_descriptor *desc;
        u32 packetlen, offset;

        if (unlikely(buflen == 0))
                return -EINVAL;

        *buffer_actual_len = 0;
        *requestid = 0;

        /* Make sure there is something to read */
        desc = hv_pkt_iter_first(channel);
        if (desc == NULL) {
                /*
                 * No error is returned even when there is no packet header;
                 * drivers are expected to check buffer_actual_len.
                 */
                return 0;
        }

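        /* offset8 and len8 are expressed in units of 8 bytes, hence << 3 */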
        offset = raw ? 0 : (desc->offset8 << 3);
        packetlen = (desc->len8 << 3) - offset;
        *buffer_actual_len = packetlen;
        *requestid = desc->trans_id;

        if (unlikely(packetlen > buflen))
                return -ENOBUFS;

        /* since ring is double mapped, only one copy is necessary */
        memcpy(buffer, (const char *)desc + offset, packetlen);

        /* Advance ring index to next packet descriptor */
        __hv_pkt_iter_next(channel, desc);

        /* Notify host of update */
        hv_pkt_iter_close(channel);

        return 0;
}

/*
 * Determine number of bytes available in ring buffer after
 * the current iterator (priv_read_index) location.
 *
 * This is similar to hv_get_bytes_to_read but with private
 * read index instead.
 */
static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
{
        u32 priv_read_loc = rbi->priv_read_index;
        u32 write_loc = READ_ONCE(rbi->ring_buffer->write_index);

        if (write_loc >= priv_read_loc)
                return write_loc - priv_read_loc;
        else
                return (rbi->ring_datasize - priv_read_loc) + write_loc;
}

/*
 * Get first vmbus packet from ring buffer after read_index
 *
 * If ring buffer is empty, returns NULL and no other action needed.
 */
struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;
        struct vmpacket_descriptor *desc;

        if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
                return NULL;

        desc = hv_get_ring_buffer(rbi) + rbi->priv_read_index;
        if (desc)
                prefetch((char *)desc + (desc->len8 << 3));

        return desc;
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_first);

/*
 * Get next vmbus packet from ring buffer.
 *
 * Advances the current location (priv_read_index) and checks for more
 * data. If the end of the ring buffer is reached, then return NULL.
 */
struct vmpacket_descriptor *
__hv_pkt_iter_next(struct vmbus_channel *channel,
                   const struct vmpacket_descriptor *desc)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;
        u32 packetlen = desc->len8 << 3;
        u32 dsize = rbi->ring_datasize;

        /* bump offset to next potential packet */
        rbi->priv_read_index += packetlen + VMBUS_PKT_TRAILER;
        if (rbi->priv_read_index >= dsize)
                rbi->priv_read_index -= dsize;

        /* more data? */
        return hv_pkt_iter_first(channel);
}
EXPORT_SYMBOL_GPL(__hv_pkt_iter_next);

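/*
 * Note on the two read indices: the iterator above advances only
 * priv_read_index, which is private to the guest.  The read_index that
 * the host sees is committed in hv_pkt_iter_close(), after all packets
 * in the batch have been copied out, so the host cannot overwrite data
 * that is still being read.
 */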
/* How many bytes were read in this iterator cycle */
static u32 hv_pkt_iter_bytes_read(const struct hv_ring_buffer_info *rbi,
                                        u32 start_read_index)
{
        if (rbi->priv_read_index >= start_read_index)
                return rbi->priv_read_index - start_read_index;
        else
                return rbi->ring_datasize - start_read_index +
                        rbi->priv_read_index;
}

/*
 * Update host ring buffer after iterating over packets. If the host has
 * stopped queuing new entries because it found the ring buffer full, and
 * sufficient space is being freed up, signal the host. But be careful to
 * only signal the host when necessary, both for performance reasons and
 * because Hyper-V protects itself by throttling guests that signal
 * inappropriately.
 *
 * Determining when to signal is tricky. There are three key data inputs
 * that must be handled in this order to avoid race conditions:
 *
 * 1. Update the read_index
 * 2. Read the pending_send_sz
 * 3. Read the current write_index
 *
 * The interrupt_mask is not used to determine when to signal. The
 * interrupt_mask is used only on the guest->host ring buffer when
 * sending requests to the host. The host does not use it on the host->
 * guest ring buffer to indicate whether it should be signaled.
 */
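/*
 * For example: if pending_send_sz is 0x1000, free space was 0x0800
 * before this batch of reads and is 0x1800 afterwards, the free space
 * crossed the pending_send_sz threshold during this cycle and the host
 * is signaled.  If it was already above the threshold beforehand, or is
 * still below it, no signal is sent.
 */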
void hv_pkt_iter_close(struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;
        u32 curr_write_sz, pending_sz, bytes_read, start_read_index;

        /*
         * Make sure all reads are done before we update the read index since
         * the writer may start writing to the read area once the read index
         * is updated.
         */
        virt_rmb();
        start_read_index = rbi->ring_buffer->read_index;
        rbi->ring_buffer->read_index = rbi->priv_read_index;

        /*
         * Older versions of Hyper-V (before WS2012 and Win8) do not
         * implement pending_send_sz and simply poll if the host->guest
         * ring buffer is full.  No signaling is needed or expected.
         */
        if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz)
                return;

        /*
         * Issue a full memory barrier before making the signaling decision.
         * If reading pending_send_sz were to be reordered and happen
         * before we commit the new read_index, a race could occur.  If the
         * host were to set the pending_send_sz after we have sampled
         * pending_send_sz, and the ring buffer blocks before we commit the
         * read index, we could miss sending the interrupt. Issue a full
         * memory barrier to address this.
         */
        virt_mb();

        /*
         * If the pending_send_sz is zero, then the ring buffer is not
         * blocked and there is no need to signal.  This is by far the
         * most common case, so exit quickly for best performance.
         */
        pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
        if (!pending_sz)
                return;

        /*
         * Ensure the read of write_index in hv_get_bytes_to_write()
         * happens after the read of pending_send_sz.
         */
        virt_rmb();
        curr_write_sz = hv_get_bytes_to_write(rbi);
        bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index);

        /*
         * We want to signal the host only if we're transitioning
         * from a "not enough free space" state to an "enough free
         * space" state.  For example, it's possible that this function
         * could run and free up enough space to signal the host, and then
         * run again and free up additional space before the host has a
         * chance to clear the pending_send_sz.  The 2nd invocation would
         * be a null transition from "enough free space" to "enough free
         * space", which doesn't warrant a signal.
         *
         * Exactly filling the ring buffer is treated as "not enough
         * space". The ring buffer must always have at least one byte
         * empty so the empty and full conditions are distinguishable.
         * hv_get_bytes_to_write() doesn't fully tell the truth in
         * this regard.
         *
         * So first check if we were in the "enough free space" state
         * before we began the iteration. If so, the host was not
         * blocked, and there's no need to signal.
         */
        if (curr_write_sz - bytes_read > pending_sz)
                return;

        /*
         * Similarly, if the new state is "not enough space", then
         * there's no need to signal.
         */
        if (curr_write_sz <= pending_sz)
                return;

        ++channel->intr_in_full;
        vmbus_setevent(channel);
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_close);
