linux/drivers/hv/ring_buffer.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 *   K. Y. Srinivasan <kys@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/hyperv.h>
#include <linux/uio.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/prefetch.h>

#include "hyperv_vmbus.h"

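/*
 * Size of the u64 "previous indices" value that hv_ringbuffer_write()
 * appends after each packet; the read iterator skips it when advancing.
 */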
#define VMBUS_PKT_TRAILER       8

/*
 * When we write to the ring buffer, check if the host needs to
 * be signaled. Here are the details of this protocol:
 *
 *      1. The host guarantees that while it is draining the
 *         ring buffer, it will set the interrupt_mask to
 *         indicate it does not need to be interrupted when
 *         new data is placed.
 *
 *      2. The host guarantees that it will completely drain
 *         the ring buffer before exiting the read loop. Further,
 *         once the ring buffer is empty, it will clear the
 *         interrupt_mask and re-check to see if new data has
 *         arrived.
 *
 * KYS: Oct. 30, 2016:
 * It looks like Windows hosts have logic to deal with DOS attacks that
 * can be triggered if they receive interrupts when they are not expecting
 * them. The host expects interrupts only when the ring transitions from
 * empty to non-empty (or from full to non-full on the guest-to-host ring).
 * So, base the signaling decision solely on the ring state until the
 * host logic is fixed.
 */

static void hv_signal_on_write(u32 old_write, struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->outbound;

        virt_mb();
        if (READ_ONCE(rbi->ring_buffer->interrupt_mask))
                return;

        /* check interrupt_mask before read_index */
        virt_rmb();
        /*
         * This is the only case we need to signal: when the
         * ring transitions from being empty to non-empty.
         */
        if (old_write == READ_ONCE(rbi->ring_buffer->read_index)) {
                ++channel->intr_out_empty;
                vmbus_setevent(channel);
        }
}

/* Get the next write location for the specified ring buffer. */
static inline u32
hv_get_next_write_location(struct hv_ring_buffer_info *ring_info)
{
        u32 next = ring_info->ring_buffer->write_index;

        return next;
}

/* Set the next write location for the specified ring buffer. */
static inline void
hv_set_next_write_location(struct hv_ring_buffer_info *ring_info,
                     u32 next_write_location)
{
        ring_info->ring_buffer->write_index = next_write_location;
}

/* Get the size of the ring buffer. */
static inline u32
hv_get_ring_buffersize(const struct hv_ring_buffer_info *ring_info)
{
        return ring_info->ring_datasize;
}

/*
 * Get the write index of the specified ring buffer, packed into the
 * upper 32 bits of a u64 (the lower 32 bits are left as zero).
 */
static inline u64
hv_get_ring_bufferindices(struct hv_ring_buffer_info *ring_info)
{
        return (u64)ring_info->ring_buffer->write_index << 32;
}

/*
 * Helper routine to copy from source to ring buffer.
 * Assumes there is enough room. Handles wrap-around in the destination
 * case only.
 */
static u32 hv_copyto_ringbuffer(
        struct hv_ring_buffer_info      *ring_info,
        u32                             start_write_offset,
        const void                      *src,
        u32                             srclen)
{
        void *ring_buffer = hv_get_ring_buffer(ring_info);
        u32 ring_buffer_size = hv_get_ring_buffersize(ring_info);

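        /*
         * The data pages are mapped twice back-to-back (see
         * hv_ringbuffer_init()), so this memcpy may safely run past
         * ring_buffer_size; only the returned offset is wrapped back
         * into range.
         */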
        memcpy(ring_buffer + start_write_offset, src, srclen);

        start_write_offset += srclen;
        if (start_write_offset >= ring_buffer_size)
                start_write_offset -= ring_buffer_size;

        return start_write_offset;
}

/*
 * hv_get_ringbuffer_availbytes()
 *
 * Get the number of bytes available to read and available to write
 * for the specified ring buffer.
 */
static void
hv_get_ringbuffer_availbytes(const struct hv_ring_buffer_info *rbi,
                             u32 *read, u32 *write)
{
        u32 read_loc, write_loc, dsize;

        /* Capture the read/write indices before they change */
        read_loc = READ_ONCE(rbi->ring_buffer->read_index);
        write_loc = READ_ONCE(rbi->ring_buffer->write_index);
        dsize = rbi->ring_datasize;

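        /*
         * Bytes currently in use are (write_loc - read_loc) modulo dsize;
         * the space still writable is the rest of the ring, and the space
         * readable is its complement.
         */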
        *write = write_loc >= read_loc ? dsize - (write_loc - read_loc) :
                read_loc - write_loc;
        *read = dsize - *write;
}

/* Get various debug metrics for the specified ring buffer. */
int hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
                                struct hv_ring_buffer_debug_info *debug_info)
{
        u32 bytes_avail_towrite;
        u32 bytes_avail_toread;

        mutex_lock(&ring_info->ring_buffer_mutex);

        if (!ring_info->ring_buffer) {
                mutex_unlock(&ring_info->ring_buffer_mutex);
                return -EINVAL;
        }

        hv_get_ringbuffer_availbytes(ring_info,
                                     &bytes_avail_toread,
                                     &bytes_avail_towrite);
        debug_info->bytes_avail_toread = bytes_avail_toread;
        debug_info->bytes_avail_towrite = bytes_avail_towrite;
        debug_info->current_read_index = ring_info->ring_buffer->read_index;
        debug_info->current_write_index = ring_info->ring_buffer->write_index;
        debug_info->current_interrupt_mask
                = ring_info->ring_buffer->interrupt_mask;
        mutex_unlock(&ring_info->ring_buffer_mutex);

        return 0;
}
EXPORT_SYMBOL_GPL(hv_ringbuffer_get_debuginfo);

/* Initialize a channel's ring buffer info mutex locks */
void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
{
        mutex_init(&channel->inbound.ring_buffer_mutex);
        mutex_init(&channel->outbound.ring_buffer_mutex);
}

/* Initialize the ring buffer. */
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
                       struct page *pages, u32 page_cnt, u32 max_pkt_size)
{
        int i;
        struct page **pages_wraparound;

        BUILD_BUG_ON((sizeof(struct hv_ring_buffer) != PAGE_SIZE));

        /*
         * First page holds struct hv_ring_buffer, do wraparound mapping for
         * the rest.
         */
        pages_wraparound = kcalloc(page_cnt * 2 - 1, sizeof(struct page *),
                                   GFP_KERNEL);
        if (!pages_wraparound)
                return -ENOMEM;

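        /*
         * Build the page list as: the header page, then the data pages
         * twice in a row.  The vmap()ed data area therefore appears as two
         * consecutive copies, so packets that wrap around the end of the
         * ring remain virtually contiguous.
         */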
        pages_wraparound[0] = pages;
        for (i = 0; i < 2 * (page_cnt - 1); i++)
                pages_wraparound[i + 1] = &pages[i % (page_cnt - 1) + 1];

        ring_info->ring_buffer = (struct hv_ring_buffer *)
                vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, PAGE_KERNEL);

        kfree(pages_wraparound);

        if (!ring_info->ring_buffer)
                return -ENOMEM;

        ring_info->ring_buffer->read_index =
                ring_info->ring_buffer->write_index = 0;

        /* Set the feature bit for enabling flow control. */
        ring_info->ring_buffer->feature_bits.value = 1;

        ring_info->ring_size = page_cnt << PAGE_SHIFT;
        ring_info->ring_size_div10_reciprocal =
                reciprocal_value(ring_info->ring_size / 10);
        ring_info->ring_datasize = ring_info->ring_size -
                sizeof(struct hv_ring_buffer);
        ring_info->priv_read_index = 0;

        /* Initialize buffer that holds copies of incoming packets */
        if (max_pkt_size) {
                ring_info->pkt_buffer = kzalloc(max_pkt_size, GFP_KERNEL);
                if (!ring_info->pkt_buffer)
                        return -ENOMEM;
                ring_info->pkt_buffer_size = max_pkt_size;
        }

        spin_lock_init(&ring_info->ring_lock);

        return 0;
}

/* Cleanup the ring buffer. */
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
{
        mutex_lock(&ring_info->ring_buffer_mutex);
        vunmap(ring_info->ring_buffer);
        ring_info->ring_buffer = NULL;
        mutex_unlock(&ring_info->ring_buffer_mutex);

        kfree(ring_info->pkt_buffer);
        ring_info->pkt_buffer = NULL;
        ring_info->pkt_buffer_size = 0;
}

/* Write to the ring buffer. */
int hv_ringbuffer_write(struct vmbus_channel *channel,
                        const struct kvec *kv_list, u32 kv_count,
                        u64 requestid)
{
        int i;
        u32 bytes_avail_towrite;
        u32 totalbytes_towrite = sizeof(u64);
        u32 next_write_location;
        u32 old_write;
        u64 prev_indices;
        unsigned long flags;
        struct hv_ring_buffer_info *outring_info = &channel->outbound;
        struct vmpacket_descriptor *desc = kv_list[0].iov_base;
        u64 rqst_id = VMBUS_NO_RQSTOR;

        if (channel->rescind)
                return -ENODEV;

        for (i = 0; i < kv_count; i++)
                totalbytes_towrite += kv_list[i].iov_len;

        spin_lock_irqsave(&outring_info->ring_lock, flags);

        bytes_avail_towrite = hv_get_bytes_to_write(outring_info);

        /*
         * If there is no more than just enough room for this packet, treat
         * the ring as full and refuse the write.  A packet that exactly
         * fills the ring would leave read_index == write_index, which is
         * indistinguishable from an empty ring.
         */
        if (bytes_avail_towrite <= totalbytes_towrite) {
                ++channel->out_full_total;

                if (!channel->out_full_flag) {
                        ++channel->out_full_first;
                        channel->out_full_flag = true;
                }

                spin_unlock_irqrestore(&outring_info->ring_lock, flags);
                return -EAGAIN;
        }

        channel->out_full_flag = false;

        /* Write to the ring buffer */
        next_write_location = hv_get_next_write_location(outring_info);

        old_write = next_write_location;

        for (i = 0; i < kv_count; i++) {
                next_write_location = hv_copyto_ringbuffer(outring_info,
                                                     next_write_location,
                                                     kv_list[i].iov_base,
                                                     kv_list[i].iov_len);
        }

        /*
         * Allocate the request ID after the data has been copied into the
         * ring buffer.  Once this request ID is allocated, the completion
         * path could find the data and free it.
         */

        if (desc->flags == VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED) {
                if (channel->next_request_id_callback != NULL) {
                        rqst_id = channel->next_request_id_callback(channel, requestid);
                        if (rqst_id == VMBUS_RQST_ERROR) {
                                spin_unlock_irqrestore(&outring_info->ring_lock, flags);
                                return -EAGAIN;
                        }
                }
        }
        desc = hv_get_ring_buffer(outring_info) + old_write;
        desc->trans_id = (rqst_id == VMBUS_NO_RQSTOR) ? requestid : rqst_id;

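        /*
         * Set the previous packet start: append the 8-byte trailer that was
         * pre-counted in totalbytes_towrite as sizeof(u64).  prev_indices
         * packs the ring's write_index, which still points at the start of
         * this packet, into its upper 32 bits.
         */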
        prev_indices = hv_get_ring_bufferindices(outring_info);

        next_write_location = hv_copyto_ringbuffer(outring_info,
                                             next_write_location,
                                             &prev_indices,
                                             sizeof(u64));

        /* Issue a full memory barrier before updating the write index */
        virt_mb();

        /* Now, update the write location */
        hv_set_next_write_location(outring_info, next_write_location);

        spin_unlock_irqrestore(&outring_info->ring_lock, flags);

        hv_signal_on_write(old_write, channel);

        if (channel->rescind) {
                if (rqst_id != VMBUS_NO_RQSTOR) {
                        /* Reclaim the request ID to avoid leaking IDs */
                        if (channel->request_addr_callback != NULL)
                                channel->request_addr_callback(channel, rqst_id);
                }
                return -ENODEV;
        }

        return 0;
}

int hv_ringbuffer_read(struct vmbus_channel *channel,
                       void *buffer, u32 buflen, u32 *buffer_actual_len,
                       u64 *requestid, bool raw)
{
        struct vmpacket_descriptor *desc;
        u32 packetlen, offset;

        if (unlikely(buflen == 0))
                return -EINVAL;

        *buffer_actual_len = 0;
        *requestid = 0;

        /* Make sure there is something to read */
        desc = hv_pkt_iter_first(channel);
        if (desc == NULL) {
                /*
                 * No error is returned when there is not even a packet
                 * header to read; drivers are expected to check
                 * buffer_actual_len instead.
                 */
                return 0;
        }

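        /* len8 and offset8 are in units of 8 bytes, hence the << 3 below */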
        offset = raw ? 0 : (desc->offset8 << 3);
        packetlen = (desc->len8 << 3) - offset;
        *buffer_actual_len = packetlen;
        *requestid = desc->trans_id;

        if (unlikely(packetlen > buflen))
                return -ENOBUFS;

        /* since ring is double mapped, only one copy is necessary */
        memcpy(buffer, (const char *)desc + offset, packetlen);

        /* Advance ring index to next packet descriptor */
        __hv_pkt_iter_next(channel, desc, true);

        /* Notify host of update */
        hv_pkt_iter_close(channel);

        return 0;
}

/*
 * Determine number of bytes available in ring buffer after
 * the current iterator (priv_read_index) location.
 *
 * This is similar to hv_get_bytes_to_read but with private
 * read index instead.
 */
static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
{
        u32 priv_read_loc = rbi->priv_read_index;
        u32 write_loc = READ_ONCE(rbi->ring_buffer->write_index);

        if (write_loc >= priv_read_loc)
                return write_loc - priv_read_loc;
        else
                return (rbi->ring_datasize - priv_read_loc) + write_loc;
}

/*
 * Get first vmbus packet without copying it out of the ring buffer
 */
struct vmpacket_descriptor *hv_pkt_iter_first_raw(struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;

        hv_debug_delay_test(channel, MESSAGE_DELAY);

        if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
                return NULL;

        return (struct vmpacket_descriptor *)(hv_get_ring_buffer(rbi) + rbi->priv_read_index);
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_first_raw);

/*
 * Get the first vmbus packet from the ring buffer after read_index.
 *
 * If the ring buffer is empty, returns NULL; no other action is needed.
 */
struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;
        struct vmpacket_descriptor *desc, *desc_copy;
        u32 bytes_avail, pkt_len, pkt_offset;

        desc = hv_pkt_iter_first_raw(channel);
        if (!desc)
                return NULL;

        bytes_avail = min(rbi->pkt_buffer_size, hv_pkt_iter_avail(rbi));

        /*
         * Ensure the compiler does not use references to incoming Hyper-V
         * values (which could change at any moment) when reading local
         * variables later in the code.
         */
        pkt_len = READ_ONCE(desc->len8) << 3;
        pkt_offset = READ_ONCE(desc->offset8) << 3;

        /*
         * If pkt_len is invalid, set it to the smaller of hv_pkt_iter_avail()
         * and rbi->pkt_buffer_size.
         */
        if (pkt_len < sizeof(struct vmpacket_descriptor) || pkt_len > bytes_avail)
                pkt_len = bytes_avail;

        /*
         * If pkt_offset is invalid, arbitrarily set it to
         * the size of vmpacket_descriptor.
         */
        if (pkt_offset < sizeof(struct vmpacket_descriptor) || pkt_offset > pkt_len)
                pkt_offset = sizeof(struct vmpacket_descriptor);

        /* Copy the Hyper-V packet out of the ring buffer */
        desc_copy = (struct vmpacket_descriptor *)rbi->pkt_buffer;
        memcpy(desc_copy, desc, pkt_len);

        /*
         * Hyper-V could still change len8 and offset8 after the earlier read.
         * Ensure that desc_copy has legal values for len8 and offset8 that
         * are consistent with the copy we just made.
         */
        desc_copy->len8 = pkt_len >> 3;
        desc_copy->offset8 = pkt_offset >> 3;

        return desc_copy;
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_first);

/*
 * Get the next vmbus packet from the ring buffer.
 *
 * Advances the current location (priv_read_index) and checks for more
 * data.  Returns NULL when no further packets are available.
 */
struct vmpacket_descriptor *
__hv_pkt_iter_next(struct vmbus_channel *channel,
                   const struct vmpacket_descriptor *desc,
                   bool copy)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;
        u32 packetlen = desc->len8 << 3;
        u32 dsize = rbi->ring_datasize;

        hv_debug_delay_test(channel, MESSAGE_DELAY);
        /* bump offset to next potential packet */
        rbi->priv_read_index += packetlen + VMBUS_PKT_TRAILER;
        if (rbi->priv_read_index >= dsize)
                rbi->priv_read_index -= dsize;

        /* more data? */
        return copy ? hv_pkt_iter_first(channel) : hv_pkt_iter_first_raw(channel);
}
EXPORT_SYMBOL_GPL(__hv_pkt_iter_next);

/* How many bytes were read in this iterator cycle */
static u32 hv_pkt_iter_bytes_read(const struct hv_ring_buffer_info *rbi,
                                        u32 start_read_index)
{
        if (rbi->priv_read_index >= start_read_index)
                return rbi->priv_read_index - start_read_index;
        else
                return rbi->ring_datasize - start_read_index +
                        rbi->priv_read_index;
}

/*
 * Update host ring buffer after iterating over packets. If the host has
 * stopped queuing new entries because it found the ring buffer full, and
 * sufficient space is being freed up, signal the host. But be careful to
 * only signal the host when necessary, both for performance reasons and
 * because Hyper-V protects itself by throttling guests that signal
 * inappropriately.
 *
 * Determining when to signal is tricky. There are three key data inputs
 * that must be handled in this order to avoid race conditions:
 *
 * 1. Update the read_index
 * 2. Read the pending_send_sz
 * 3. Read the current write_index
 *
 * The interrupt_mask is not used to determine when to signal. The
 * interrupt_mask is used only on the guest->host ring buffer when
 * sending requests to the host. The host does not use it on the host->
 * guest ring buffer to indicate whether it should be signaled.
 */
void hv_pkt_iter_close(struct vmbus_channel *channel)
{
        struct hv_ring_buffer_info *rbi = &channel->inbound;
        u32 curr_write_sz, pending_sz, bytes_read, start_read_index;

        /*
         * Make sure all reads are done before we update the read index since
         * the writer may start writing to the read area once the read index
         * is updated.
         */
        virt_rmb();
        start_read_index = rbi->ring_buffer->read_index;
        rbi->ring_buffer->read_index = rbi->priv_read_index;

        /*
         * Older versions of Hyper-V (before WS2012 and Win8) do not
         * implement pending_send_sz and simply poll if the host->guest
         * ring buffer is full.  No signaling is needed or expected.
         */
        if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz)
                return;

        /*
         * Issue a full memory barrier before making the signaling decision.
         * If reading pending_send_sz were to be reordered and happen
         * before we commit the new read_index, a race could occur.  If the
         * host were to set the pending_send_sz after we have sampled
         * pending_send_sz, and the ring buffer blocks before we commit the
         * read index, we could miss sending the interrupt. Issue a full
         * memory barrier to address this.
         */
        virt_mb();

        /*
         * If the pending_send_sz is zero, then the ring buffer is not
         * blocked and there is no need to signal.  This is by far the
         * most common case, so exit quickly for best performance.
         */
        pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
        if (!pending_sz)
                return;

        /*
         * Ensure the read of write_index in hv_get_bytes_to_write()
         * happens after the read of pending_send_sz.
         */
        virt_rmb();
        curr_write_sz = hv_get_bytes_to_write(rbi);
        bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index);

        /*
         * We want to signal the host only if we're transitioning
         * from a "not enough free space" state to an "enough free
         * space" state.  For example, it's possible that this function
         * could run and free up enough space to signal the host, and then
         * run again and free up additional space before the host has a
         * chance to clear the pending_send_sz.  The 2nd invocation would
         * be a null transition from "enough free space" to "enough free
         * space", which doesn't warrant a signal.
         *
         * Exactly filling the ring buffer is treated as "not enough
         * space".  The ring buffer must always have at least one byte
         * empty so the empty and full conditions are distinguishable.
         * hv_get_bytes_to_write() doesn't fully tell the truth in
         * this regard.
         *
         * So first check if we were in the "enough free space" state
         * before we began the iteration.  If so, the host was not
         * blocked, and there's no need to signal.
         */
        if (curr_write_sz - bytes_read > pending_sz)
                return;

        /*
         * Similarly, if the new state is "not enough space", then
         * there's no need to signal.
         */
        if (curr_write_sz <= pending_sz)
                return;

        ++channel->intr_in_full;
        vmbus_setevent(channel);
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_close);

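/*
 * For orientation only: a minimal, hypothetical sketch (not part of
 * ring_buffer.c) of how a VMBus driver's channel callback might consume
 * packets with the iterator API exported above -- hv_pkt_iter_first(),
 * __hv_pkt_iter_next() and hv_pkt_iter_close().  The function name and
 * the payload handling below are placeholders; in-tree drivers normally
 * use the wrapper helpers declared in include/linux/hyperv.h rather than
 * calling __hv_pkt_iter_next() directly.
 */
static void example_onchannelcallback(void *context)
{
        struct vmbus_channel *channel = context;
        struct vmpacket_descriptor *desc;

        /* Walk every packet currently queued in the inbound ring */
        for (desc = hv_pkt_iter_first(channel);
             desc;
             desc = __hv_pkt_iter_next(channel, desc, true)) {
                u32 offset = desc->offset8 << 3;           /* payload offset in bytes */
                u32 paylen = (desc->len8 << 3) - offset;   /* payload length in bytes */
                void *payload = (char *)desc + offset;

                /* ... process payload/paylen for this packet ... */
        }

        /* Commit read_index and signal the host if it was blocked on a full ring */
        hv_pkt_iter_close(channel);
}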