qemu/net/vhost-vdpa.c
/*
 * vhost-vdpa.c
 *
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "clients.h"
#include "hw/virtio/virtio-net.h"
#include "net/vhost_net.h"
#include "net/vhost-vdpa.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "qemu/option.h"
#include "qapi/error.h"
#include <linux/vhost.h>
#include <sys/ioctl.h>
#include <err.h>
#include "standard-headers/linux/virtio_net.h"
#include "monitor/monitor.h"
#include "migration/migration.h"
#include "migration/misc.h"
#include "hw/virtio/vhost.h"

/* TODO: add multiqueue support here */
typedef struct VhostVDPAState {
    NetClientState nc;
    struct vhost_vdpa vhost_vdpa;
    Notifier migration_state;
    VHostNetState *vhost_net;

    /* Control commands shadow buffers */
    void *cvq_cmd_out_buffer;
    virtio_net_ctrl_ack *status;

    /* The device always has SVQ enabled */
    bool always_svq;
    bool started;
} VhostVDPAState;

const int vdpa_feature_bits[] = {
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_RING_F_INDIRECT_DESC,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_F_ANY_LAYOUT,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_CSUM,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GSO,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_MTU,
    VIRTIO_NET_F_CTRL_RX,
    VIRTIO_NET_F_CTRL_RX_EXTRA,
    VIRTIO_NET_F_CTRL_VLAN,
    VIRTIO_NET_F_CTRL_MAC_ADDR,
    VIRTIO_NET_F_MQ,
    VIRTIO_NET_F_CTRL_VQ,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_RING_PACKED,
    VIRTIO_F_RING_RESET,
    VIRTIO_NET_F_RSS,
    VIRTIO_NET_F_HASH_REPORT,
    VIRTIO_NET_F_STATUS,
    VHOST_INVALID_FEATURE_BIT
};

/** Supported device specific feature bits with SVQ */
static const uint64_t vdpa_svq_device_features =
    BIT_ULL(VIRTIO_NET_F_CSUM) |
    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
    BIT_ULL(VIRTIO_NET_F_MTU) |
    BIT_ULL(VIRTIO_NET_F_MAC) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
    BIT_ULL(VIRTIO_NET_F_STATUS) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
    BIT_ULL(VIRTIO_NET_F_MQ) |
    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
    /* VHOST_F_LOG_ALL is exposed by SVQ */
    BIT_ULL(VHOST_F_LOG_ALL) |
    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
    BIT_ULL(VIRTIO_NET_F_STANDBY);

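/*
 * Address space id reserved for the shadow CVQ buffers when the control
 * virtqueue group can be isolated from the data virtqueues.
 */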
#define VHOST_VDPA_NET_CVQ_ASID 1

VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return s->vhost_net;
}

static bool vhost_vdpa_net_valid_svq_features(uint64_t features, Error **errp)
{
    uint64_t invalid_dev_features =
        features & ~vdpa_svq_device_features &
        /* Transport features are all accepted at this point */
        ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
                         VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);

    if (invalid_dev_features) {
        error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
                   invalid_dev_features);
        return false;
    }

    return vhost_svq_valid_features(features, errp);
}

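/* Verify that the vDPA device exposes a network device (VIRTIO_ID_NET) */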
static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
{
    uint32_t device_id;
    int ret;
    struct vhost_dev *hdev;

    hdev = (struct vhost_dev *)&net->dev;
    ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
    if (ret) {
        return ret;
    }
    if (device_id != VIRTIO_ID_NET) {
        return -ENOTSUP;
    }
    return 0;
}

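/*
 * Initialize the vhost_net backend for a range of virtqueues and check that
 * the underlying vDPA device is a network device.
 */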
static int vhost_vdpa_add(NetClientState *ncs, void *be,
                          int queue_pair_index, int nvqs)
{
    VhostNetOptions options;
    struct vhost_net *net = NULL;
    VhostVDPAState *s;
    int ret;

    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, ncs);
    options.net_backend = ncs;
    options.opaque      = be;
    options.busyloop_timeout = 0;
    options.nvqs = nvqs;

    net = vhost_net_init(&options);
    if (!net) {
        error_report("failed to init vhost_net for queue");
        goto err_init;
    }
    s->vhost_net = net;
    ret = vhost_vdpa_net_check_device_id(net);
    if (ret) {
        goto err_check;
    }
    return 0;
err_check:
    vhost_net_cleanup(net);
    g_free(net);
err_init:
    return -1;
}

static void vhost_vdpa_cleanup(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    qemu_vfree(s->cvq_cmd_out_buffer);
    qemu_vfree(s->status);
    if (s->vhost_net) {
        vhost_net_cleanup(s->vhost_net);
        g_free(s->vhost_net);
        s->vhost_net = NULL;
    }
    if (s->vhost_vdpa.device_fd >= 0) {
        qemu_close(s->vhost_vdpa.device_fd);
        s->vhost_vdpa.device_fd = -1;
    }
}

static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    return true;
}

static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    uint64_t features = 0;
    features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
    features = vhost_net_get_features(s->vhost_net, features);
    return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
}

static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
                                       Error **errp)
{
    const char *driver = object_class_get_name(oc);

    if (!g_str_has_prefix(driver, "virtio-net-")) {
        error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
        return false;
    }

    return true;
}

/** Dummy receive in case qemu falls back to userland tap networking */
static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
                                  size_t size)
{
    return size;
}

/** From any vdpa net client, get the netclient of the first queue pair */
static VhostVDPAState *vhost_vdpa_net_first_nc_vdpa(VhostVDPAState *s)
{
    NICState *nic = qemu_get_nic(s->nc.peer);
    NetClientState *nc0 = qemu_get_peer(nic->ncs, 0);

    return DO_UPCAST(VhostVDPAState, nc, nc0);
}

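/*
 * Restart vhost-net so that the data virtqueues are switched to or from SVQ
 * mode.  SVQ is how vhost-vdpa exposes VHOST_F_LOG_ALL (dirty page logging)
 * during migration.
 */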
static void vhost_vdpa_net_log_global_enable(VhostVDPAState *s, bool enable)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;
    VirtIONet *n;
    VirtIODevice *vdev;
    int data_queue_pairs, cvq, r;

    /* We are only called on the first data vqs and only if x-svq is not set */
    if (s->vhost_vdpa.shadow_vqs_enabled == enable) {
        return;
    }

    vdev = v->dev->vdev;
    n = VIRTIO_NET(vdev);
    if (!n->vhost_started) {
        return;
    }

    data_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
    cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
                                  n->max_ncs - n->max_queue_pairs : 0;
    /*
     * TODO: vhost_net_stop does suspend, get_base and reset. We can be smarter
     * in the future and resume the device if read-only operations between
     * suspend and reset go wrong.
     */
    vhost_net_stop(vdev, n->nic->ncs, data_queue_pairs, cvq);

    /* Start will check migration state to decide whether to configure SVQ */
    r = vhost_net_start(vdev, n->nic->ncs, data_queue_pairs, cvq);
    if (unlikely(r < 0)) {
        error_report("unable to start vhost net: %s(%d)", g_strerror(-r), -r);
    }
}

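/*
 * Switch the datapath to SVQ when migration enters setup, and back to
 * passthrough if migration fails.
 */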
static void vdpa_net_migration_state_notifier(Notifier *notifier, void *data)
{
    MigrationState *migration = data;
    VhostVDPAState *s = container_of(notifier, VhostVDPAState,
                                     migration_state);

    if (migration_in_setup(migration)) {
        vhost_vdpa_net_log_global_enable(s, true);
    } else if (migration_has_failed(migration)) {
        vhost_vdpa_net_log_global_enable(s, false);
    }
}

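/*
 * Setup performed only by the first data virtqueue pair: register the
 * migration notifier and, with SVQ enabled, allocate the shared IOVA tree.
 */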
static void vhost_vdpa_net_data_start_first(VhostVDPAState *s)
{
    struct vhost_vdpa *v = &s->vhost_vdpa;

    add_migration_state_change_notifier(&s->migration_state);
    if (v->shadow_vqs_enabled) {
        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
                                           v->iova_range.last);
    }
}

static int vhost_vdpa_net_data_start(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->always_svq ||
        migration_is_setup_or_active(migrate_get_current()->state)) {
        v->shadow_vqs_enabled = true;
        v->shadow_data = true;
    } else {
        v->shadow_vqs_enabled = false;
        v->shadow_data = false;
    }

    if (v->index == 0) {
        vhost_vdpa_net_data_start_first(s);
        return 0;
    }

    if (v->shadow_vqs_enabled) {
        VhostVDPAState *s0 = vhost_vdpa_net_first_nc_vdpa(s);
        v->iova_tree = s0->vhost_vdpa.iova_tree;
    }

    return 0;
}

static void vhost_vdpa_net_client_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_dev *dev;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.index == 0) {
        remove_migration_state_change_notifier(&s->migration_state);
    }

    dev = s->vhost_vdpa.dev;
    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
    }
}

static NetClientInfo net_vhost_vdpa_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_data_start,
    .stop = vhost_vdpa_net_client_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
};

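/*
 * Return the virtqueue group that virtqueue index vq_index belongs to, or a
 * negative value on failure.
 */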
static int64_t vhost_vdpa_get_vring_group(int device_fd, unsigned vq_index)
{
    struct vhost_vring_state state = {
        .index = vq_index,
    };
    int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, &state);

    if (unlikely(r < 0)) {
        error_report("Cannot get VQ %u group: %s", vq_index,
                     g_strerror(errno));
        return r;
    }

    return state.num;
}

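/* Assign address space id asid_num to virtqueue group vq_group */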
static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
                                           unsigned vq_group,
                                           unsigned asid_num)
{
    struct vhost_vring_state asid = {
        .index = vq_group,
        .num = asid_num,
    };
    int r;

    r = ioctl(v->device_fd, VHOST_VDPA_SET_GROUP_ASID, &asid);
    if (unlikely(r < 0)) {
        error_report("Can't set vq group %u asid %u, errno=%d (%s)",
                     asid.index, asid.num, errno, g_strerror(errno));
    }
    return r;
}

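/* Unmap a shadow CVQ buffer from the device and drop it from the IOVA tree */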
static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
{
    VhostIOVATree *tree = v->iova_tree;
    DMAMap needle = {
        /*
         * No need to specify size or to look for more translations since
         * this contiguous chunk was allocated by us.
         */
        .translated_addr = (hwaddr)(uintptr_t)addr,
    };
    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
    int r;

    if (unlikely(!map)) {
        error_report("Cannot locate expected map");
        return;
    }

    r = vhost_vdpa_dma_unmap(v, v->address_space_id, map->iova, map->size + 1);
    if (unlikely(r != 0)) {
        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
    }

    vhost_iova_tree_remove(tree, *map);
}

static size_t vhost_vdpa_net_cvq_cmd_len(void)
{
    /*
     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
     * The in buffer is always 1 byte, so it always fits here.
     */
    return sizeof(struct virtio_net_ctrl_hdr) +
           2 * sizeof(struct virtio_net_ctrl_mac) +
           MAC_TABLE_ENTRIES * ETH_ALEN;
}

static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
}

/** Map CVQ buffer. */
static int vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v, void *buf, size_t size,
                                  bool write)
{
    DMAMap map = {};
    int r;

    map.translated_addr = (hwaddr)(uintptr_t)buf;
    map.size = size - 1;
    map.perm = write ? IOMMU_RW : IOMMU_RO;
    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
    if (unlikely(r != IOVA_OK)) {
        error_report("Cannot map injected element");
        return r;
    }

    r = vhost_vdpa_dma_map(v, v->address_space_id, map.iova,
                           vhost_vdpa_net_cvq_cmd_page_len(), buf, !write);
    if (unlikely(r < 0)) {
        goto dma_map_err;
    }

    return 0;

dma_map_err:
    vhost_iova_tree_remove(v->iova_tree, map);
    return r;
}

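/*
 * Start the CVQ client: decide whether CVQ can be shadowed in its own address
 * space and, if SVQ is in use, map the shadow command and status buffers.
 */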
static int vhost_vdpa_net_cvq_start(NetClientState *nc)
{
    VhostVDPAState *s, *s0;
    struct vhost_vdpa *v;
    uint64_t backend_features;
    int64_t cvq_group;
    int cvq_index, r;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    s = DO_UPCAST(VhostVDPAState, nc, nc);
    v = &s->vhost_vdpa;

    s0 = vhost_vdpa_net_first_nc_vdpa(s);
    v->shadow_data = s0->vhost_vdpa.shadow_vqs_enabled;
    v->shadow_vqs_enabled = s->always_svq;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_GUEST_PA_ASID;

    if (s->vhost_vdpa.shadow_data) {
        /* SVQ is already configured for all virtqueues */
        goto out;
    }

    /*
     * If we return early in these cases, SVQ will not be enabled.  Migration
     * will be blocked as long as the vhost-vdpa backend does not offer _F_LOG.
     *
     * Use VHOST_GET_BACKEND_FEATURES directly because the backend features
     * are not available in v->dev yet.
     */
    r = ioctl(v->device_fd, VHOST_GET_BACKEND_FEATURES, &backend_features);
    if (unlikely(r < 0)) {
        error_report("Cannot get vdpa backend_features: %s(%d)",
            g_strerror(errno), errno);
        return -1;
    }
    if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID)) ||
        !vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
        return 0;
    }

    /*
     * Check if all the data virtqueues are in a different vq group than the
     * last vq (CVQ).  The group of the last vq is stored in cvq_group.
     */
    cvq_index = v->dev->vq_index_end - 1;
    cvq_group = vhost_vdpa_get_vring_group(v->device_fd, cvq_index);
    if (unlikely(cvq_group < 0)) {
        return cvq_group;
    }
    for (int i = 0; i < cvq_index; ++i) {
        int64_t group = vhost_vdpa_get_vring_group(v->device_fd, i);

        if (unlikely(group < 0)) {
            return group;
        }

        if (group == cvq_group) {
            return 0;
        }
    }

    r = vhost_vdpa_set_address_space_id(v, cvq_group, VHOST_VDPA_NET_CVQ_ASID);
    if (unlikely(r < 0)) {
        return r;
    }

    v->shadow_vqs_enabled = true;
    s->vhost_vdpa.address_space_id = VHOST_VDPA_NET_CVQ_ASID;

out:
    if (!s->vhost_vdpa.shadow_vqs_enabled) {
        return 0;
    }

    if (s0->vhost_vdpa.iova_tree) {
        /*
         * SVQ is already configured for all virtqueues.  Reuse the IOVA tree
         * for simplicity, whether CVQ shares the ASID with the guest or not,
         * because:
         * - The memory listener needs access to the guest's memory addresses
         *   allocated in the IOVA tree.
         * - There should be plenty of IOVA address space for both ASIDs not
         *   to worry about collisions between them.  The guest's translations
         *   are still validated with virtio virtqueue_pop, so there is no
         *   risk of the guest accessing memory that it shouldn't.
         *
         * Allocating an IOVA tree per ASID is doable but it complicates the
         * code and is not worth it for the moment.
         */
        v->iova_tree = s0->vhost_vdpa.iova_tree;
    } else {
        v->iova_tree = vhost_iova_tree_new(v->iova_range.first,
                                           v->iova_range.last);
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer,
                               vhost_vdpa_net_cvq_cmd_page_len(), false);
    if (unlikely(r < 0)) {
        return r;
    }

    r = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, s->status,
                               vhost_vdpa_net_cvq_cmd_page_len(), true);
    if (unlikely(r < 0)) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
    }

    return r;
}

static void vhost_vdpa_net_cvq_stop(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (s->vhost_vdpa.shadow_vqs_enabled) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->status);
    }

    vhost_vdpa_net_client_stop(nc);
}

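/*
 * Send a control command through the shadow CVQ and wait for the device to
 * complete it.  Returns the length written by the device, or a negative
 * error code if the command could not be queued.
 */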
static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, size_t out_len,
                                      size_t in_len)
{
    /* Buffers for the device */
    const struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
        .iov_len = out_len,
    };
    const struct iovec in = {
        .iov_base = s->status,
        .iov_len = sizeof(virtio_net_ctrl_ack),
    };
    VhostShadowVirtqueue *svq = g_ptr_array_index(s->vhost_vdpa.shadow_vqs, 0);
    int r;

    r = vhost_svq_add(svq, &out, 1, &in, 1, NULL);
    if (unlikely(r != 0)) {
        if (unlikely(r == -ENOSPC)) {
            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
                          __func__);
        }
        return r;
    }

    /*
     * We can poll here since we've had the BQL from the time we sent the
     * descriptor.  Also, we need to take the answer before SVQ pulls it by
     * itself, when the BQL is released.
     */
    return vhost_svq_poll(svq);
}

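/*
 * Compose a control command (header plus payload) in the shadow out buffer
 * and submit it to the device.
 */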
static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, uint8_t class,
                                       uint8_t cmd, const void *data,
                                       size_t data_size)
{
    const struct virtio_net_ctrl_hdr ctrl = {
        .class = class,
        .cmd = cmd,
    };

    assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));

    memcpy(s->cvq_cmd_out_buffer, &ctrl, sizeof(ctrl));
    memcpy(s->cvq_cmd_out_buffer + sizeof(ctrl), data, data_size);

    return vhost_vdpa_net_cvq_add(s, sizeof(ctrl) + data_size,
                                  sizeof(virtio_net_ctrl_ack));
}

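/* Restore the MAC address if the guest acked VIRTIO_NET_F_CTRL_MAC_ADDR */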
static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n)
{
    uint64_t features = n->parent_obj.guest_features;
    if (features & BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR)) {
        ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MAC,
                                                  VIRTIO_NET_CTRL_MAC_ADDR_SET,
                                                  n->mac, sizeof(n->mac));
        if (unlikely(dev_written < 0)) {
            return dev_written;
        }

        return *s->status != VIRTIO_NET_OK;
    }

    return 0;
}

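/* Restore curr_queue_pairs if the guest acked VIRTIO_NET_F_MQ */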
static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
                                  const VirtIONet *n)
{
    struct virtio_net_ctrl_mq mq;
    uint64_t features = n->parent_obj.guest_features;
    ssize_t dev_written;

    if (!(features & BIT_ULL(VIRTIO_NET_F_MQ))) {
        return 0;
    }

    mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
    dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MQ,
                                          VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &mq,
                                          sizeof(mq));
    if (unlikely(dev_written < 0)) {
        return dev_written;
    }

    return *s->status != VIRTIO_NET_OK;
}

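/*
 * Replay the guest-visible virtio-net state (MAC, MQ) through CVQ so the
 * device state matches the guest's after (re)starting with SVQ enabled.
 */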
static int vhost_vdpa_net_load(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_vdpa *v = &s->vhost_vdpa;
    const VirtIONet *n;
    int r;

    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);

    if (!v->shadow_vqs_enabled) {
        return 0;
    }

    n = VIRTIO_NET(v->dev->vdev);
    r = vhost_vdpa_net_load_mac(s, n);
    if (unlikely(r < 0)) {
        return r;
    }
    r = vhost_vdpa_net_load_mq(s, n);
    if (unlikely(r)) {
        return r;
    }

    return 0;
}

static NetClientInfo net_vhost_vdpa_cvq_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .start = vhost_vdpa_net_cvq_start,
    .load = vhost_vdpa_net_load,
    .stop = vhost_vdpa_net_cvq_stop,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
};

/**
 * Validate and copy control virtqueue commands.
 *
 * Following QEMU guidelines, we offer a copy of the buffers to the device to
 * prevent TOCTOU bugs.
 */
static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                            VirtQueueElement *elem,
                                            void *opaque)
{
    VhostVDPAState *s = opaque;
    size_t in_len;
    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
    /* Out buffer sent to both the vdpa device and the device model */
    struct iovec out = {
        .iov_base = s->cvq_cmd_out_buffer,
    };
    /* in buffer used for device model */
    const struct iovec in = {
        .iov_base = &status,
        .iov_len = sizeof(status),
    };
    ssize_t dev_written = -EINVAL;

    out.iov_len = iov_to_buf(elem->out_sg, elem->out_num, 0,
                             s->cvq_cmd_out_buffer,
                             vhost_vdpa_net_cvq_cmd_len());
    if (*(uint8_t *)s->cvq_cmd_out_buffer == VIRTIO_NET_CTRL_ANNOUNCE) {
        /*
         * Guest announce capability is emulated by qemu, so don't forward to
         * the device.
         */
        dev_written = sizeof(status);
        *s->status = VIRTIO_NET_OK;
    } else {
        dev_written = vhost_vdpa_net_cvq_add(s, out.iov_len, sizeof(status));
        if (unlikely(dev_written < 0)) {
            goto out;
        }
    }

    if (unlikely(dev_written < sizeof(status))) {
        error_report("Insufficient written data (%zu)", dev_written);
        goto out;
    }

    if (*s->status != VIRTIO_NET_OK) {
        return VIRTIO_NET_ERR;
    }

    status = VIRTIO_NET_ERR;
    virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, &out, 1);
    if (status != VIRTIO_NET_OK) {
        error_report("Bad CVQ processing in model");
    }

out:
    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
                          sizeof(status));
    if (unlikely(in_len < sizeof(status))) {
        error_report("Bad device CVQ written length");
    }
    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
    g_free(elem);
    return dev_written < 0 ? dev_written : 0;
}

static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
};

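/*
 * Create one vhost-vdpa net client.  Data path clients use
 * net_vhost_vdpa_info; the CVQ client uses net_vhost_vdpa_cvq_info and gets
 * its shadow buffers allocated here.
 */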
static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                       const char *device,
                                       const char *name,
                                       int vdpa_device_fd,
                                       int queue_pair_index,
                                       int nvqs,
                                       bool is_datapath,
                                       bool svq,
                                       struct vhost_vdpa_iova_range iova_range,
                                       uint64_t features)
{
    NetClientState *nc = NULL;
    VhostVDPAState *s;
    int ret = 0;
    assert(name);
    if (is_datapath) {
        nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
                                 name);
    } else {
        nc = qemu_new_net_control_client(&net_vhost_vdpa_cvq_info, peer,
                                         device, name);
    }
    qemu_set_info_str(nc, TYPE_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, nc);

    s->vhost_vdpa.device_fd = vdpa_device_fd;
    s->vhost_vdpa.index = queue_pair_index;
    s->always_svq = svq;
    s->migration_state.notify = vdpa_net_migration_state_notifier;
    s->vhost_vdpa.shadow_vqs_enabled = svq;
    s->vhost_vdpa.iova_range = iova_range;
    s->vhost_vdpa.shadow_data = svq;
    if (queue_pair_index == 0) {
        vhost_vdpa_net_valid_svq_features(features,
                                          &s->vhost_vdpa.migration_blocker);
    } else if (!is_datapath) {
        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
                                            vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
        s->status = qemu_memalign(qemu_real_host_page_size(),
                                  vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->status, 0, vhost_vdpa_net_cvq_cmd_page_len());

        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
        s->vhost_vdpa.shadow_vq_ops_opaque = s;

        /*
         * TODO: We cannot migrate devices with CVQ as there is no way to set
         * the device state (MAC, MQ, etc) before starting the datapath.
         *
         * Migration blocker ownership now belongs to s->vhost_vdpa.
         */
        error_setg(&s->vhost_vdpa.migration_blocker,
                   "net vdpa cannot migrate with CVQ feature");
    }
    ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
    if (ret) {
        qemu_del_net_client(nc);
        return NULL;
    }
    return nc;
}

static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
{
    int ret = ioctl(fd, VHOST_GET_FEATURES, features);
    if (unlikely(ret < 0)) {
        error_setg_errno(errp, errno,
                         "Failed to query features from vhost-vDPA device");
    }
    return ret;
}

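/*
 * Read max_virtqueue_pairs from the device config space and report, based on
 * the offered features, whether the device provides a control virtqueue.
 */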
static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
                                          int *has_cvq, Error **errp)
{
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    g_autofree struct vhost_vdpa_config *config = NULL;
    __virtio16 *max_queue_pairs;
    int ret;

    if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
        *has_cvq = 1;
    } else {
        *has_cvq = 0;
    }

    if (features & (1 << VIRTIO_NET_F_MQ)) {
        config = g_malloc0(config_size + sizeof(*max_queue_pairs));
        config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
        config->len = sizeof(*max_queue_pairs);

        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
        if (ret) {
            error_setg(errp, "Failed to get config from vhost-vDPA device");
            return -1;
        }

        max_queue_pairs = (__virtio16 *)&config->buf;

        return lduw_le_p(max_queue_pairs);
    }

    return 1;
}

int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                        NetClientState *peer, Error **errp)
{
    const NetdevVhostVDPAOptions *opts;
    uint64_t features;
    int vdpa_device_fd;
    g_autofree NetClientState **ncs = NULL;
    struct vhost_vdpa_iova_range iova_range;
    NetClientState *nc;
    int queue_pairs, r, i = 0, has_cvq = 0;

    assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    opts = &netdev->u.vhost_vdpa;
    if (!opts->vhostdev && !opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: neither vhostdev= nor vhostfd= was specified");
        return -1;
    }

    if (opts->vhostdev && opts->vhostfd) {
        error_setg(errp,
                   "vhost-vdpa: vhostdev= and vhostfd= are mutually exclusive");
        return -1;
    }

    if (opts->vhostdev) {
        vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
        if (vdpa_device_fd == -1) {
            return -errno;
        }
    } else {
        /* has_vhostfd */
        vdpa_device_fd = monitor_fd_param(monitor_cur(), opts->vhostfd, errp);
        if (vdpa_device_fd == -1) {
            error_prepend(errp, "vhost-vdpa: unable to parse vhostfd: ");
            return -1;
        }
    }

    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
    if (unlikely(r < 0)) {
        goto err;
    }

    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
                                                 &has_cvq, errp);
    if (queue_pairs < 0) {
        qemu_close(vdpa_device_fd);
        return queue_pairs;
    }

    r = vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
    if (unlikely(r < 0)) {
        error_setg(errp, "vhost-vdpa: get iova range failed: %s",
                   strerror(-r));
        goto err;
    }

    if (opts->x_svq && !vhost_vdpa_net_valid_svq_features(features, errp)) {
        goto err;
    }

    ncs = g_malloc0(sizeof(*ncs) * queue_pairs);

    for (i = 0; i < queue_pairs; i++) {
        ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                     vdpa_device_fd, i, 2, true, opts->x_svq,
                                     iova_range, features);
        if (!ncs[i]) {
            goto err;
        }
    }

    if (has_cvq) {
        nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                 vdpa_device_fd, i, 1, false,
                                 opts->x_svq, iova_range, features);
        if (!nc) {
            goto err;
        }
    }

    return 0;

err:
    if (i) {
        for (i--; i >= 0; i--) {
            qemu_del_net_client(ncs[i]);
        }
    }

    qemu_close(vdpa_device_fd);

    return -1;
}