linux/drivers/hv/ring_buffer.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 *   K. Y. Srinivasan <kys@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/hyperv.h>
#include <linux/uio.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/prefetch.h>

#include "hyperv_vmbus.h"

#define VMBUS_PKT_TRAILER       8

/*
 * When we write to the ring buffer, check if the host needs to
 * be signaled. Here are the details of this protocol:
 *
 *      1. The host guarantees that while it is draining the
 *         ring buffer, it will set the interrupt_mask to
 *         indicate it does not need to be interrupted when
 *         new data is placed.
 *
 *      2. The host guarantees that it will completely drain
 *         the ring buffer before exiting the read loop. Further,
 *         once the ring buffer is empty, it will clear the
 *         interrupt_mask and re-check to see if new data has
 *         arrived.
 *
 * KYS: Oct. 30, 2016:
 * It looks like Windows hosts have logic to deal with DOS attacks that
 * can be triggered if the host receives interrupts when it is not
 * expecting them. The host expects interrupts only when the ring
 * transitions from empty to non-empty (or from full to non-full on the
 * guest-to-host ring).
 * So, base the signaling decision solely on the ring state until the
 * host logic is fixed.
 */

static void hv_signal_on_write(u32 old_write, struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->outbound;

        virt_mb();
        if (READ_ONCE(rbi->ring_buffer->interrupt_mask))
                return;

        /* check interrupt_mask before read_index */
        virt_rmb();
        /*
         * This is the only case we need to signal when the
         * ring transitions from being empty to non-empty.
         */
        if (old_write == READ_ONCE(rbi->ring_buffer->read_index)) {
                ++channel->intr_out_empty;
                vmbus_setevent(channel);
        }
}
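
/*
 * Worked example (illustrative numbers): suppose the guest->host ring has
 * read_index == write_index == 0x200, i.e. it is empty and the host may
 * have stopped draining it.  A subsequent hv_ringbuffer_write() copies a
 * packet starting at old_write == 0x200 and advances write_index.  Since
 * old_write equals the read_index sampled above, this write performed the
 * empty -> non-empty transition and the host is signaled exactly once.
 * If old_write != read_index, the ring already held unread data and no
 * further interrupt is needed.
 */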

/* Get the next write location for the specified ring buffer. */
static inline u32
hv_get_next_write_location(struct hv_ring_buffer_info *ring_info)
{
        u32 next = ring_info->ring_buffer->write_index;

        return next;
}

/* Set the next write location for the specified ring buffer. */
static inline void
hv_set_next_write_location(struct hv_ring_buffer_info *ring_info,
                     u32 next_write_location)
{
        ring_info->ring_buffer->write_index = next_write_location;
}

/* Get the size of the ring buffer. */
static inline u32
hv_get_ring_buffersize(const struct hv_ring_buffer_info *ring_info)
{
        return ring_info->ring_datasize;
}

/*
 * Get the write index of the specified ring buffer as the upper 32 bits
 * of a u64; the lower 32 bits are zero.
 */
static inline u64
hv_get_ring_bufferindices(struct hv_ring_buffer_info *ring_info)
{
        return (u64)ring_info->ring_buffer->write_index << 32;
}

/*
 * Helper routine to copy from source to ring buffer.
 * Assumes there is enough room.  Because the ring buffer is double-mapped
 * (see hv_ringbuffer_init()), a copy that runs past the end of the ring
 * simply spills into the aliased second mapping; only the returned write
 * offset needs to wrap around.
 */
static u32 hv_copyto_ringbuffer(
        struct hv_ring_buffer_info      *ring_info,
        u32                             start_write_offset,
        const void                      *src,
        u32                             srclen)
{
        void *ring_buffer = hv_get_ring_buffer(ring_info);
        u32 ring_buffer_size = hv_get_ring_buffersize(ring_info);

        memcpy(ring_buffer + start_write_offset, src, srclen);

        start_write_offset += srclen;
        if (start_write_offset >= ring_buffer_size)
                start_write_offset -= ring_buffer_size;

        return start_write_offset;
}

/*
 * hv_get_ringbuffer_availbytes()
 *
 * Get the number of bytes available to read and to write for the
 * specified ring buffer.
 */
static void
hv_get_ringbuffer_availbytes(const struct hv_ring_buffer_info *rbi,
                             u32 *read, u32 *write)
{
        u32 read_loc, write_loc, dsize;

        /* Capture the read/write indices before they change */
        read_loc = READ_ONCE(rbi->ring_buffer->read_index);
        write_loc = READ_ONCE(rbi->ring_buffer->write_index);
        dsize = rbi->ring_datasize;

        *write = write_loc >= read_loc ? dsize - (write_loc - read_loc) :
                read_loc - write_loc;
        *read = dsize - *write;
}
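
/*
 * Worked example (illustrative numbers): with dsize = 0x1000,
 * read_index = 0x100 and write_index = 0x900, the writer is ahead of the
 * reader, so *write = 0x1000 - (0x900 - 0x100) = 0x800 and
 * *read = 0x1000 - 0x800 = 0x800.  If instead write_index = 0x080 (the
 * writer has wrapped), *write = 0x100 - 0x080 = 0x080 and *read = 0xf80.
 */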

/* Get various debug metrics for the specified ring buffer. */
int hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
                                struct hv_ring_buffer_debug_info *debug_info)
{
        u32 bytes_avail_towrite;
        u32 bytes_avail_toread;

        mutex_lock(&ring_info->ring_buffer_mutex);

        if (!ring_info->ring_buffer) {
                mutex_unlock(&ring_info->ring_buffer_mutex);
                return -EINVAL;
        }

        hv_get_ringbuffer_availbytes(ring_info,
                                     &bytes_avail_toread,
                                     &bytes_avail_towrite);
        debug_info->bytes_avail_toread = bytes_avail_toread;
        debug_info->bytes_avail_towrite = bytes_avail_towrite;
        debug_info->current_read_index = ring_info->ring_buffer->read_index;
        debug_info->current_write_index = ring_info->ring_buffer->write_index;
        debug_info->current_interrupt_mask
                = ring_info->ring_buffer->interrupt_mask;
        mutex_unlock(&ring_info->ring_buffer_mutex);

        return 0;
}
EXPORT_SYMBOL_GPL(hv_ringbuffer_get_debuginfo);

/* Initialize a channel's ring buffer info mutex locks */
void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
{
        mutex_init(&channel->inbound.ring_buffer_mutex);
        mutex_init(&channel->outbound.ring_buffer_mutex);
}

/* Initialize the ring buffer. */
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
                       struct page *pages, u32 page_cnt)
{
        int i;
        struct page **pages_wraparound;

        BUILD_BUG_ON((sizeof(struct hv_ring_buffer) != PAGE_SIZE));

        /*
         * First page holds struct hv_ring_buffer, do wraparound mapping for
         * the rest.
         */
        pages_wraparound = kcalloc(page_cnt * 2 - 1, sizeof(struct page *),
                                   GFP_KERNEL);
        if (!pages_wraparound)
                return -ENOMEM;

        pages_wraparound[0] = pages;
        for (i = 0; i < 2 * (page_cnt - 1); i++)
                pages_wraparound[i + 1] = &pages[i % (page_cnt - 1) + 1];

        ring_info->ring_buffer = (struct hv_ring_buffer *)
                vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, PAGE_KERNEL);

        kfree(pages_wraparound);

        if (!ring_info->ring_buffer)
                return -ENOMEM;

        ring_info->ring_buffer->read_index =
                ring_info->ring_buffer->write_index = 0;

        /* Set the feature bit for enabling flow control. */
        ring_info->ring_buffer->feature_bits.value = 1;

        ring_info->ring_size = page_cnt << PAGE_SHIFT;
        ring_info->ring_size_div10_reciprocal =
                reciprocal_value(ring_info->ring_size / 10);
        ring_info->ring_datasize = ring_info->ring_size -
                sizeof(struct hv_ring_buffer);
        ring_info->priv_read_index = 0;

        spin_lock_init(&ring_info->ring_lock);

        return 0;
}
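
/*
 * Worked example (illustrative): for page_cnt = 4, pages[0] holds the
 * struct hv_ring_buffer header and pages[1..3] hold ring data.  The
 * pages_wraparound array built above is
 *
 *      { pages[0], pages[1], pages[2], pages[3], pages[1], pages[2], pages[3] }
 *
 * so vmap() maps the three data pages twice, back to back.  A memcpy()
 * that starts near the end of the first data mapping runs harmlessly into
 * the second mapping, which aliases the start of the ring; see
 * hv_copyto_ringbuffer() and the single memcpy() in hv_ringbuffer_read().
 */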

/* Cleanup the ring buffer. */
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
{
        mutex_lock(&ring_info->ring_buffer_mutex);
        vunmap(ring_info->ring_buffer);
        ring_info->ring_buffer = NULL;
        mutex_unlock(&ring_info->ring_buffer_mutex);
}

/* Write to the ring buffer. */
int hv_ringbuffer_write(struct vmbus_channel *channel,
                        const struct kvec *kv_list, u32 kv_count,
                        u64 requestid)
{
        int i;
        u32 bytes_avail_towrite;
        u32 totalbytes_towrite = sizeof(u64);
        u32 next_write_location;
        u32 old_write;
        u64 prev_indices;
        unsigned long flags;
        struct hv_ring_buffer_info *outring_info = &channel->outbound;
        struct vmpacket_descriptor *desc = kv_list[0].iov_base;
        u64 rqst_id = VMBUS_NO_RQSTOR;

        if (channel->rescind)
                return -ENODEV;

        for (i = 0; i < kv_count; i++)
                totalbytes_towrite += kv_list[i].iov_len;

        spin_lock_irqsave(&outring_info->ring_lock, flags);

        bytes_avail_towrite = hv_get_bytes_to_write(outring_info);

        /*
         * If there is only just enough room for the packet, treat the ring
         * buffer as full.  Otherwise, the write index would catch up with
         * the read index, and on the next pass the ring buffer would be
         * mistaken for empty (read index == write index).
         */
        if (bytes_avail_towrite <= totalbytes_towrite) {
                ++channel->out_full_total;

                if (!channel->out_full_flag) {
                        ++channel->out_full_first;
                        channel->out_full_flag = true;
                }

                spin_unlock_irqrestore(&outring_info->ring_lock, flags);
                return -EAGAIN;
        }

        channel->out_full_flag = false;

        /* Write to the ring buffer */
        next_write_location = hv_get_next_write_location(outring_info);

        old_write = next_write_location;

        for (i = 0; i < kv_count; i++) {
                next_write_location = hv_copyto_ringbuffer(outring_info,
                                                     next_write_location,
                                                     kv_list[i].iov_base,
                                                     kv_list[i].iov_len);
        }

        /*
         * Allocate the request ID after the data has been copied into the
         * ring buffer.  Once this request ID is allocated, the completion
         * path could find the data and free it.
         */
        if (desc->flags == VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED) {
                rqst_id = vmbus_next_request_id(&channel->requestor, requestid);
                if (rqst_id == VMBUS_RQST_ERROR) {
                        spin_unlock_irqrestore(&outring_info->ring_lock, flags);
                        return -EAGAIN;
                }
        }
        desc = hv_get_ring_buffer(outring_info) + old_write;
        desc->trans_id = (rqst_id == VMBUS_NO_RQSTOR) ? requestid : rqst_id;

        /* Set previous packet start */
        prev_indices = hv_get_ring_bufferindices(outring_info);

        next_write_location = hv_copyto_ringbuffer(outring_info,
                                             next_write_location,
                                             &prev_indices,
                                             sizeof(u64));

        /* Issue a full memory barrier before updating the write index */
        virt_mb();

        /* Now, update the write location */
        hv_set_next_write_location(outring_info, next_write_location);

        spin_unlock_irqrestore(&outring_info->ring_lock, flags);

        hv_signal_on_write(old_write, channel);

        if (channel->rescind) {
                if (rqst_id != VMBUS_NO_RQSTOR) {
                        /* Reclaim request ID to avoid leak of IDs */
                        vmbus_request_addr(&channel->requestor, rqst_id);
                }
                return -ENODEV;
        }

        return 0;
}
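
/*
 * Usage sketch: roughly how a sender assembles the kvec list consumed by
 * hv_ringbuffer_write() above (loosely modeled on the vmbus_sendpacket()
 * path).  Kept under "#if 0" because it is only illustrative; the caller,
 * buffer and requestid values are assumptions, not part of this file.
 */
#if 0
static int example_send_inband(struct vmbus_channel *channel,
                               void *buffer, u32 bufferlen, u64 requestid)
{
        struct vmpacket_descriptor desc;
        u32 packetlen = sizeof(struct vmpacket_descriptor) + bufferlen;
        u32 packetlen_aligned = ALIGN(packetlen, sizeof(u64));
        u64 aligned_data = 0;
        struct kvec bufferlist[3];

        /* Descriptor lengths/offsets are expressed in 8-byte units */
        desc.type = VM_PKT_DATA_INBAND;
        desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
        desc.offset8 = sizeof(struct vmpacket_descriptor) >> 3;
        desc.len8 = (u16)(packetlen_aligned >> 3);

        bufferlist[0].iov_base = &desc;
        bufferlist[0].iov_len = sizeof(struct vmpacket_descriptor);
        bufferlist[1].iov_base = buffer;
        bufferlist[1].iov_len = bufferlen;
        bufferlist[2].iov_base = &aligned_data;
        bufferlist[2].iov_len = packetlen_aligned - packetlen;

        return hv_ringbuffer_write(channel, bufferlist, 3, requestid);
}
#endif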

int hv_ringbuffer_read(struct vmbus_channel *channel,
                       void *buffer, u32 buflen, u32 *buffer_actual_len,
                       u64 *requestid, bool raw)
{
        struct vmpacket_descriptor *desc;
        u32 packetlen, offset;

        if (unlikely(buflen == 0))
                return -EINVAL;

        *buffer_actual_len = 0;
        *requestid = 0;

        /* Make sure there is something to read */
        desc = hv_pkt_iter_first(channel);
        if (desc == NULL) {
                /*
                 * No error is set when there is not even a header; drivers
                 * are expected to check buffer_actual_len.
                 */
                return 0;
        }

        offset = raw ? 0 : (desc->offset8 << 3);
        packetlen = (desc->len8 << 3) - offset;
        *buffer_actual_len = packetlen;
        *requestid = desc->trans_id;

        if (unlikely(packetlen > buflen))
                return -ENOBUFS;

        /* since ring is double mapped, only one copy is necessary */
        memcpy(buffer, (const char *)desc + offset, packetlen);

        /* Advance ring index to next packet descriptor */
        __hv_pkt_iter_next(channel, desc);

        /* Notify host of update */
        hv_pkt_iter_close(channel);

        return 0;
}
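
/*
 * Worked example (illustrative numbers): for an inband packet whose
 * descriptor reports offset8 = 2 and len8 = 8, the header occupies
 * 2 * 8 = 16 bytes and the whole packet 8 * 8 = 64 bytes.  With
 * raw == false the caller is handed packetlen = 64 - 16 = 48 bytes of
 * payload starting just past the header; with raw == true it gets all
 * 64 bytes, including the descriptor.
 */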

/*
 * Determine number of bytes available in ring buffer after
 * the current iterator (priv_read_index) location.
 *
 * This is similar to hv_get_bytes_to_read but with private
 * read index instead.
 */
static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
{
        u32 priv_read_loc = rbi->priv_read_index;
        u32 write_loc = READ_ONCE(rbi->ring_buffer->write_index);

        if (write_loc >= priv_read_loc)
                return write_loc - priv_read_loc;
        else
                return (rbi->ring_datasize - priv_read_loc) + write_loc;
}
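
/*
 * Worked example (illustrative numbers): with ring_datasize = 0x1000,
 * priv_read_index = 0xf00 and write_index = 0x100, the writer has
 * wrapped, so the bytes still to be parsed are
 * (0x1000 - 0xf00) + 0x100 = 0x200.  In the unwrapped case, e.g.
 * write_index = 0xf80, the result is simply 0xf80 - 0xf00 = 0x80.
 */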

/*
 * Get first vmbus packet from ring buffer after read_index
 *
 * If ring buffer is empty, returns NULL and no other action needed.
 */
struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;
        struct vmpacket_descriptor *desc;

        hv_debug_delay_test(channel, MESSAGE_DELAY);
        if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
                return NULL;

        desc = hv_get_ring_buffer(rbi) + rbi->priv_read_index;
        if (desc)
                prefetch((char *)desc + (desc->len8 << 3));

        return desc;
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_first);

/*
 * Get next vmbus packet from ring buffer.
 *
 * Advances the current location (priv_read_index) and checks for more
 * data. If the end of the ring buffer is reached, then return NULL.
 */
struct vmpacket_descriptor *
__hv_pkt_iter_next(struct vmbus_channel *channel,
                   const struct vmpacket_descriptor *desc)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;
        u32 packetlen = desc->len8 << 3;
        u32 dsize = rbi->ring_datasize;

        hv_debug_delay_test(channel, MESSAGE_DELAY);
        /* bump offset to next potential packet */
        rbi->priv_read_index += packetlen + VMBUS_PKT_TRAILER;
        if (rbi->priv_read_index >= dsize)
                rbi->priv_read_index -= dsize;

        /* more data? */
        return hv_pkt_iter_first(channel);
}
EXPORT_SYMBOL_GPL(__hv_pkt_iter_next);
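
/*
 * Usage sketch: the canonical consumer loop for the iterator API exported
 * above, as a VMBus device driver might use it from its channel callback.
 * Kept under "#if 0" because it is only illustrative; example_handle_pkt()
 * is a hypothetical per-driver handler, not part of this file.
 */
#if 0
static void example_channel_callback(struct vmbus_channel *channel)
{
        struct vmpacket_descriptor *desc;

        for (desc = hv_pkt_iter_first(channel); desc;
             desc = __hv_pkt_iter_next(channel, desc))
                example_handle_pkt(channel, desc);

        /* Commit priv_read_index and signal the host if it was blocked */
        hv_pkt_iter_close(channel);
}
#endif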

/* How many bytes were read in this iterator cycle */
static u32 hv_pkt_iter_bytes_read(const struct hv_ring_buffer_info *rbi,
                                        u32 start_read_index)
{
        if (rbi->priv_read_index >= start_read_index)
                return rbi->priv_read_index - start_read_index;
        else
                return rbi->ring_datasize - start_read_index +
                        rbi->priv_read_index;
}

/*
 * Update host ring buffer after iterating over packets. If the host has
 * stopped queuing new entries because it found the ring buffer full, and
 * sufficient space is being freed up, signal the host. But be careful to
 * only signal the host when necessary, both for performance reasons and
 * because Hyper-V protects itself by throttling guests that signal
 * inappropriately.
 *
 * Determining when to signal is tricky. There are three key data inputs
 * that must be handled in this order to avoid race conditions:
 *
 * 1. Update the read_index
 * 2. Read the pending_send_sz
 * 3. Read the current write_index
 *
 * The interrupt_mask is not used to determine when to signal. The
 * interrupt_mask is used only on the guest->host ring buffer when
 * sending requests to the host. The host does not use it on the host->
 * guest ring buffer to indicate whether it should be signaled.
 */
void hv_pkt_iter_close(struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;
        u32 curr_write_sz, pending_sz, bytes_read, start_read_index;

        /*
         * Make sure all reads are done before we update the read index since
         * the writer may start writing to the read area once the read index
         * is updated.
         */
        virt_rmb();
        start_read_index = rbi->ring_buffer->read_index;
        rbi->ring_buffer->read_index = rbi->priv_read_index;

        /*
         * Older versions of Hyper-V (before WS2012 and Win8) do not
         * implement pending_send_sz and simply poll if the host->guest
         * ring buffer is full.  No signaling is needed or expected.
         */
        if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz)
                return;

        /*
         * Issue a full memory barrier before making the signaling decision.
         * If reading pending_send_sz were to be reordered and happen
         * before we commit the new read_index, a race could occur.  If the
         * host were to set the pending_send_sz after we have sampled
         * pending_send_sz, and the ring buffer blocks before we commit the
         * read index, we could miss sending the interrupt. Issue a full
         * memory barrier to address this.
         */
        virt_mb();

        /*
         * If the pending_send_sz is zero, then the ring buffer is not
         * blocked and there is no need to signal.  This is by far the
         * most common case, so exit quickly for best performance.
         */
        pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
        if (!pending_sz)
                return;

        /*
         * Ensure the read of write_index in hv_get_bytes_to_write()
         * happens after the read of pending_send_sz.
         */
        virt_rmb();
        curr_write_sz = hv_get_bytes_to_write(rbi);
        bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index);

        /*
         * We want to signal the host only if we're transitioning
         * from a "not enough free space" state to an "enough free
         * space" state.  For example, it's possible that this function
         * could run and free up enough space to signal the host, and then
         * run again and free up additional space before the host has a
         * chance to clear the pending_send_sz.  The 2nd invocation would
         * be a null transition from "enough free space" to "enough free
         * space", which doesn't warrant a signal.
         *
         * Exactly filling the ring buffer is treated as "not enough
         * space". The ring buffer must always have at least one byte
         * empty so the empty and full conditions are distinguishable.
         * hv_get_bytes_to_write() doesn't fully tell the truth in
         * this regard.
         *
         * So first check if we were in the "enough free space" state
         * before we began the iteration. If so, the host was not
         * blocked, and there's no need to signal.
         */
        if (curr_write_sz - bytes_read > pending_sz)
                return;

        /*
         * Similarly, if the new state is "not enough space", then
         * there's no need to signal.
         */
        if (curr_write_sz <= pending_sz)
                return;

        ++channel->intr_in_full;
        vmbus_setevent(channel);
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_close);
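
/*
 * Worked example for the checks in hv_pkt_iter_close() (illustrative
 * numbers): suppose pending_send_sz = 0x400, the iteration consumed
 * bytes_read = 0x300, and afterwards curr_write_sz = 0x500.  Before the
 * iteration the writable space was 0x500 - 0x300 = 0x200 <= 0x400, so the
 * host was blocked; now 0x500 > 0x400, so enough space has been freed and
 * the host is signaled.  Had curr_write_sz been 0x380 (still <= 0x400), or
 * had the pre-iteration space already exceeded 0x400, no signal would be
 * sent.
 */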