dpdk/drivers/net/vhost/rte_eth_vhost.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2016 IGEL Co., Ltd.
   3 * Copyright(c) 2016-2018 Intel Corporation
   4 */
   5#include <stdlib.h>
   6#include <unistd.h>
   7#include <pthread.h>
   8#include <stdbool.h>
   9#include <sys/epoll.h>
  10
  11#include <rte_mbuf.h>
  12#include <ethdev_driver.h>
  13#include <ethdev_vdev.h>
  14#include <rte_malloc.h>
  15#include <rte_memcpy.h>
  16#include <rte_net.h>
  17#include <bus_vdev_driver.h>
  18#include <rte_kvargs.h>
  19#include <rte_vhost.h>
  20#include <rte_spinlock.h>
  21
  22#include "rte_eth_vhost.h"
  23
  24RTE_LOG_REGISTER_DEFAULT(vhost_logtype, NOTICE);
  25
  26#define VHOST_LOG(level, ...) \
  27        rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
  28
  29enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
  30
  31#define ETH_VHOST_IFACE_ARG             "iface"
  32#define ETH_VHOST_QUEUES_ARG            "queues"
  33#define ETH_VHOST_CLIENT_ARG            "client"
  34#define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
  35#define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
  36#define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
  37#define ETH_VHOST_LINEAR_BUF            "linear-buffer"
  38#define ETH_VHOST_EXT_BUF               "ext-buffer"
  39#define ETH_VHOST_LEGACY_OL_FLAGS       "legacy-ol-flags"
  40#define VHOST_MAX_PKT_BURST 32
  41
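    /*
     * Illustrative devargs usage only (see the parameter string registered at
     * the bottom of this file for the authoritative list), e.g.:
     *   --vdev 'net_vhost0,iface=/tmp/vhost.sock,queues=2,client=1'
     */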
  42static const char *valid_arguments[] = {
  43        ETH_VHOST_IFACE_ARG,
  44        ETH_VHOST_QUEUES_ARG,
  45        ETH_VHOST_CLIENT_ARG,
  46        ETH_VHOST_IOMMU_SUPPORT,
  47        ETH_VHOST_POSTCOPY_SUPPORT,
  48        ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
  49        ETH_VHOST_LINEAR_BUF,
  50        ETH_VHOST_EXT_BUF,
  51        ETH_VHOST_LEGACY_OL_FLAGS,
  52        NULL
  53};
  54
  55static struct rte_ether_addr base_eth_addr = {
  56        .addr_bytes = {
  57                0x56 /* V */,
  58                0x48 /* H */,
  59                0x4F /* O */,
  60                0x53 /* S */,
  61                0x54 /* T */,
  62                0x00
  63        }
  64};
  65
  66struct vhost_stats {
  67        uint64_t pkts;
  68        uint64_t bytes;
  69        uint64_t missed_pkts;
  70};
  71
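    /*
     * Per-queue context. allow_queuing/while_queuing form a small gate used by
     * update_queuing_status(): the control path clears allow_queuing and then
     * spins on while_queuing so that rx/tx bursts are guaranteed to have left
     * the vhost library before the device is stopped or detached.
     */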
  72struct vhost_queue {
  73        int vid;
  74        rte_atomic32_t allow_queuing;
  75        rte_atomic32_t while_queuing;
  76        struct pmd_internal *internal;
  77        struct rte_mempool *mb_pool;
  78        uint16_t port;
  79        uint16_t virtqueue_id;
  80        struct vhost_stats stats;
  81        int intr_enable;
  82        rte_spinlock_t intr_lock;
  83};
  84
  85struct pmd_internal {
  86        rte_atomic32_t dev_attached;
  87        char *iface_name;
  88        uint64_t flags;
  89        uint64_t disable_flags;
  90        uint64_t features;
  91        uint16_t max_queues;
  92        int vid;
  93        rte_atomic32_t started;
  94        bool vlan_strip;
  95        bool rx_sw_csum;
  96        bool tx_sw_csum;
  97};
  98
  99struct internal_list {
 100        TAILQ_ENTRY(internal_list) next;
 101        struct rte_eth_dev *eth_dev;
 102};
 103
 104TAILQ_HEAD(internal_list_head, internal_list);
 105static struct internal_list_head internal_list =
 106        TAILQ_HEAD_INITIALIZER(internal_list);
 107
 108static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
 109
 110static struct rte_eth_link pmd_link = {
 111                .link_speed = 10000,
 112                .link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
 113                .link_status = RTE_ETH_LINK_DOWN
 114};
 115
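    /*
     * Per-port vring state. cur[] records the latest enable/disable
     * notification for each virtqueue, seen[] what the application has already
     * consumed through rte_eth_vhost_get_queue_event(). Virtqueue 2*q is the
     * guest Rx ring backing Tx queue q, 2*q+1 the guest Tx ring backing Rx
     * queue q.
     */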
 116struct rte_vhost_vring_state {
 117        rte_spinlock_t lock;
 118
 119        bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
 120        bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
 121        unsigned int index;
 122        unsigned int max_vring;
 123};
 124
 125static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
 126
 127static int
 128vhost_dev_xstats_reset(struct rte_eth_dev *dev)
 129{
 130        struct vhost_queue *vq;
 131        int ret, i;
 132
 133        for (i = 0; i < dev->data->nb_rx_queues; i++) {
 134                vq = dev->data->rx_queues[i];
 135                ret = rte_vhost_vring_stats_reset(vq->vid, vq->virtqueue_id);
 136                if (ret < 0)
 137                        return ret;
 138        }
 139
 140        for (i = 0; i < dev->data->nb_tx_queues; i++) {
 141                vq = dev->data->tx_queues[i];
 142                ret = rte_vhost_vring_stats_reset(vq->vid, vq->virtqueue_id);
 143                if (ret < 0)
 144                        return ret;
 145        }
 146
 147        return 0;
 148}
 149
 150static int
 151vhost_dev_xstats_get_names(struct rte_eth_dev *dev,
 152                           struct rte_eth_xstat_name *xstats_names,
 153                           unsigned int limit)
 154{
 155        struct rte_vhost_stat_name *name;
 156        struct vhost_queue *vq;
 157        int ret, i, count = 0, nstats = 0;
 158
 159        for (i = 0; i < dev->data->nb_rx_queues; i++) {
 160                vq = dev->data->rx_queues[i];
 161                ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id, NULL, 0);
 162                if (ret < 0)
 163                        return ret;
 164
 165                nstats += ret;
 166        }
 167
 168        for (i = 0; i < dev->data->nb_tx_queues; i++) {
 169                vq = dev->data->tx_queues[i];
 170                ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id, NULL, 0);
 171                if (ret < 0)
 172                        return ret;
 173
 174                nstats += ret;
 175        }
 176
 177        if (!xstats_names || limit < (unsigned int)nstats)
 178                return nstats;
 179
 180        name = calloc(nstats, sizeof(*name));
 181        if (!name)
 182                return -1;
 183
 184        for (i = 0; i < dev->data->nb_rx_queues; i++) {
 185                vq = dev->data->rx_queues[i];
 186                ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id,
 187                                name + count, nstats - count);
 188                if (ret < 0) {
 189                        free(name);
 190                        return ret;
 191                }
 192
 193                count += ret;
 194        }
 195
 196        for (i = 0; i < dev->data->nb_tx_queues; i++) {
 197                vq = dev->data->tx_queues[i];
 198                ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id,
 199                                name + count, nstats - count);
 200                if (ret < 0) {
 201                        free(name);
 202                        return ret;
 203                }
 204
 205                count += ret;
 206        }
 207
 208        for (i = 0; i < count; i++)
 209                strncpy(xstats_names[i].name, name[i].name, RTE_ETH_XSTATS_NAME_SIZE);
 210
 211        free(name);
 212
 213        return count;
 214}
 215
 216static int
 217vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
 218                     unsigned int n)
 219{
 220        struct rte_vhost_stat *stats;
 221        struct vhost_queue *vq;
 222        int ret, i, count = 0, nstats = 0;
 223
 224        for (i = 0; i < dev->data->nb_rx_queues; i++) {
 225                vq = dev->data->rx_queues[i];
 226                ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id, NULL, 0);
 227                if (ret < 0)
 228                        return ret;
 229
 230                nstats += ret;
 231        }
 232
 233        for (i = 0; i < dev->data->nb_tx_queues; i++) {
 234                vq = dev->data->tx_queues[i];
 235                ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id, NULL, 0);
 236                if (ret < 0)
 237                        return ret;
 238
 239                nstats += ret;
 240        }
 241
 242        if (!xstats || n < (unsigned int)nstats)
 243                return nstats;
 244
 245        stats = calloc(nstats, sizeof(*stats));
 246        if (!stats)
 247                return -1;
 248
 249        for (i = 0; i < dev->data->nb_rx_queues; i++) {
 250                vq = dev->data->rx_queues[i];
 251                ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id,
 252                                stats + count, nstats - count);
 253                if (ret < 0) {
 254                        free(stats);
 255                        return ret;
 256                }
 257
 258                count += ret;
 259        }
 260
 261        for (i = 0; i < dev->data->nb_tx_queues; i++) {
 262                vq = dev->data->tx_queues[i];
 263                ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id,
 264                                stats + count, nstats - count);
 265                if (ret < 0) {
 266                        free(stats);
 267                        return ret;
 268                }
 269
 270                count += ret;
 271        }
 272
 273        for (i = 0; i < count; i++) {
 274                xstats[i].id = stats[i].id;
 275                xstats[i].value = stats[i].value;
 276        }
 277
 278        free(stats);
 279
 280        return nstats;
 281}
 282
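    /*
     * Decide whether L4 checksums must be computed in software. Only relevant
     * in Virtio-net compliant offload mode: Rx SW checksum is used when the
     * guest may send partially checksummed packets (VIRTIO_NET_F_CSUM) but the
     * application did not request Rx checksum offload; Tx SW checksum is used
     * when the application requests Tx checksum offload but the guest did not
     * negotiate VIRTIO_NET_F_GUEST_CSUM.
     */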
 283static void
 284vhost_dev_csum_configure(struct rte_eth_dev *eth_dev)
 285{
 286        struct pmd_internal *internal = eth_dev->data->dev_private;
 287        const struct rte_eth_rxmode *rxmode = &eth_dev->data->dev_conf.rxmode;
 288        const struct rte_eth_txmode *txmode = &eth_dev->data->dev_conf.txmode;
 289
 290        internal->rx_sw_csum = false;
 291        internal->tx_sw_csum = false;
 292
 293        /* SW checksum is not compatible with legacy mode */
 294        if (!(internal->flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS))
 295                return;
 296
 297        if (internal->features & (1ULL << VIRTIO_NET_F_CSUM)) {
 298                if (!(rxmode->offloads &
 299                                (RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_TCP_CKSUM))) {
 300                        VHOST_LOG(NOTICE, "Rx csum will be done in SW, may impact performance.\n");
 301                        internal->rx_sw_csum = true;
 302                }
 303        }
 304
 305        if (!(internal->features & (1ULL << VIRTIO_NET_F_GUEST_CSUM))) {
 306                if (txmode->offloads &
 307                                (RTE_ETH_TX_OFFLOAD_UDP_CKSUM | RTE_ETH_TX_OFFLOAD_TCP_CKSUM)) {
 308                        VHOST_LOG(NOTICE, "Tx csum will be done in SW, may impact performance.\n");
 309                        internal->tx_sw_csum = true;
 310                }
 311        }
 312}
 313
 314static void
 315vhost_dev_tx_sw_csum(struct rte_mbuf *mbuf)
 316{
 317        uint32_t hdr_len;
 318        uint16_t csum = 0, csum_offset;
 319
 320        switch (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
 321        case RTE_MBUF_F_TX_L4_NO_CKSUM:
 322                return;
 323        case RTE_MBUF_F_TX_TCP_CKSUM:
 324                csum_offset = offsetof(struct rte_tcp_hdr, cksum);
 325                break;
 326        case RTE_MBUF_F_TX_UDP_CKSUM:
 327                csum_offset = offsetof(struct rte_udp_hdr, dgram_cksum);
 328                break;
 329        default:
 330                /* Unsupported packet type. */
 331                return;
 332        }
 333
 334        hdr_len = mbuf->l2_len + mbuf->l3_len;
 335        csum_offset += hdr_len;
 336
 337        /* Prepare the pseudo-header checksum */
 338        if (rte_net_intel_cksum_prepare(mbuf) < 0)
 339                return;
 340
 341        if (rte_raw_cksum_mbuf(mbuf, hdr_len, rte_pktmbuf_pkt_len(mbuf) - hdr_len, &csum) < 0)
 342                return;
 343
 344        csum = ~csum;
 345        /* See RFC768 */
 346        if (unlikely((mbuf->packet_type & RTE_PTYPE_L4_UDP) && csum == 0))
 347                csum = 0xffff;
 348
 349        if (rte_pktmbuf_data_len(mbuf) >= csum_offset + 1)
 350                *rte_pktmbuf_mtod_offset(mbuf, uint16_t *, csum_offset) = csum;
 351
 352        mbuf->ol_flags &= ~RTE_MBUF_F_TX_L4_MASK;
 353        mbuf->ol_flags |= RTE_MBUF_F_TX_L4_NO_CKSUM;
 354}
 355
 356static void
 357vhost_dev_rx_sw_csum(struct rte_mbuf *mbuf)
 358{
 359        struct rte_net_hdr_lens hdr_lens;
 360        uint32_t ptype, hdr_len;
 361        uint16_t csum = 0, csum_offset;
 362
 363        /* Return early if the L4 checksum was not offloaded */
 364        if ((mbuf->ol_flags & RTE_MBUF_F_RX_L4_CKSUM_MASK) != RTE_MBUF_F_RX_L4_CKSUM_NONE)
 365                return;
 366
 367        ptype = rte_net_get_ptype(mbuf, &hdr_lens, RTE_PTYPE_ALL_MASK);
 368
 369        hdr_len = hdr_lens.l2_len + hdr_lens.l3_len;
 370
 371        switch (ptype & RTE_PTYPE_L4_MASK) {
 372        case RTE_PTYPE_L4_TCP:
 373                csum_offset = offsetof(struct rte_tcp_hdr, cksum) + hdr_len;
 374                break;
 375        case RTE_PTYPE_L4_UDP:
 376                csum_offset = offsetof(struct rte_udp_hdr, dgram_cksum) + hdr_len;
 377                break;
 378        default:
 379                /* Unsupported packet type */
 380                return;
 381        }
 382
 383        /* The pseudo-header checksum has already been filled in, as per the Virtio spec */
 384        if (rte_raw_cksum_mbuf(mbuf, hdr_len, rte_pktmbuf_pkt_len(mbuf) - hdr_len, &csum) < 0)
 385                return;
 386
 387        csum = ~csum;
 388        /* See RFC768 */
 389        if (unlikely((ptype & RTE_PTYPE_L4_UDP) && csum == 0))
 390                csum = 0xffff;
 391
 392        if (rte_pktmbuf_data_len(mbuf) >= csum_offset + 1)
 393                *rte_pktmbuf_mtod_offset(mbuf, uint16_t *, csum_offset) = csum;
 394
 395        mbuf->ol_flags &= ~RTE_MBUF_F_RX_L4_CKSUM_MASK;
 396        mbuf->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_GOOD;
 397}
 398
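    /*
     * Rx burst: dequeue from the guest Tx virtqueue in chunks of
     * VHOST_MAX_PKT_BURST, then apply optional VLAN stripping and software
     * checksum before handing the mbufs to the application.
     */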
 399static uint16_t
 400eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
 401{
 402        struct vhost_queue *r = q;
 403        uint16_t i, nb_rx = 0;
 404        uint16_t nb_receive = nb_bufs;
 405
 406        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
 407                return 0;
 408
 409        rte_atomic32_set(&r->while_queuing, 1);
 410
 411        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
 412                goto out;
 413
 414        /* Dequeue packets from guest TX queue */
 415        while (nb_receive) {
 416                uint16_t nb_pkts;
 417                uint16_t num = (uint16_t)RTE_MIN(nb_receive,
 418                                                 VHOST_MAX_PKT_BURST);
 419
 420                nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
 421                                                  r->mb_pool, &bufs[nb_rx],
 422                                                  num);
 423
 424                nb_rx += nb_pkts;
 425                nb_receive -= nb_pkts;
 426                if (nb_pkts < num)
 427                        break;
 428        }
 429
 430        r->stats.pkts += nb_rx;
 431
 432        for (i = 0; likely(i < nb_rx); i++) {
 433                bufs[i]->port = r->port;
 434                bufs[i]->vlan_tci = 0;
 435
 436                if (r->internal->vlan_strip)
 437                        rte_vlan_strip(bufs[i]);
 438
 439                if (r->internal->rx_sw_csum)
 440                        vhost_dev_rx_sw_csum(bufs[i]);
 441
 442                r->stats.bytes += bufs[i]->pkt_len;
 443        }
 444
 445out:
 446        rte_atomic32_set(&r->while_queuing, 0);
 447
 448        return nb_rx;
 449}
 450
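    /*
     * Tx burst: perform optional VLAN insertion and software checksum, then
     * enqueue to the guest Rx virtqueue. rte_vhost_enqueue_burst() copies the
     * packet data, so successfully enqueued mbufs are freed here; packets that
     * could not be enqueued remain owned by the caller and are counted as
     * missed.
     */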
 451static uint16_t
 452eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
 453{
 454        struct vhost_queue *r = q;
 455        uint16_t i, nb_tx = 0;
 456        uint16_t nb_send = 0;
 457        uint64_t nb_bytes = 0;
 458        uint64_t nb_missed = 0;
 459
 460        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
 461                return 0;
 462
 463        rte_atomic32_set(&r->while_queuing, 1);
 464
 465        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
 466                goto out;
 467
 468        for (i = 0; i < nb_bufs; i++) {
 469                struct rte_mbuf *m = bufs[i];
 470
 471                /* Do VLAN tag insertion */
 472                if (m->ol_flags & RTE_MBUF_F_TX_VLAN) {
 473                        int error = rte_vlan_insert(&m);
 474                        if (unlikely(error)) {
 475                                rte_pktmbuf_free(m);
 476                                continue;
 477                        }
 478                }
 479
 480                if (r->internal->tx_sw_csum)
 481                        vhost_dev_tx_sw_csum(m);
 482
 484                bufs[nb_send] = m;
 485                ++nb_send;
 486        }
 487
 488        /* Enqueue packets to guest RX queue */
 489        while (nb_send) {
 490                uint16_t nb_pkts;
 491                uint16_t num = (uint16_t)RTE_MIN(nb_send,
 492                                                 VHOST_MAX_PKT_BURST);
 493
 494                nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
 495                                                  &bufs[nb_tx], num);
 496
 497                nb_tx += nb_pkts;
 498                nb_send -= nb_pkts;
 499                if (nb_pkts < num)
 500                        break;
 501        }
 502
 503        for (i = 0; likely(i < nb_tx); i++)
 504                nb_bytes += bufs[i]->pkt_len;
 505
 506        nb_missed = nb_bufs - nb_tx;
 507
 508        r->stats.pkts += nb_tx;
 509        r->stats.bytes += nb_bytes;
 510        r->stats.missed_pkts += nb_missed;
 511
 512        for (i = 0; likely(i < nb_tx); i++)
 513                rte_pktmbuf_free(bufs[i]);
 514out:
 515        rte_atomic32_set(&r->while_queuing, 0);
 516
 517        return nb_tx;
 518}
 519
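    /* Look up the port registered for a given vhost-user interface name. */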
 520static inline struct internal_list *
 521find_internal_resource(char *ifname)
 522{
 523        int found = 0;
 524        struct internal_list *list;
 525        struct pmd_internal *internal;
 526
 527        if (ifname == NULL)
 528                return NULL;
 529
 530        pthread_mutex_lock(&internal_list_lock);
 531
 532        TAILQ_FOREACH(list, &internal_list, next) {
 533                internal = list->eth_dev->data->dev_private;
 534                if (!strcmp(internal->iface_name, ifname)) {
 535                        found = 1;
 536                        break;
 537                }
 538        }
 539
 540        pthread_mutex_unlock(&internal_list_lock);
 541
 542        if (!found)
 543                return NULL;
 544
 545        return list;
 546}
 547
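    /*
     * The kickfd registered in the application's epoll set may have been
     * re-created (e.g. on reconnection); swap the stale fd for the current one
     * kept in the interrupt handle.
     */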
 548static int
 549eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
 550{
 551        struct rte_intr_handle *handle = eth_dev->intr_handle;
 552        struct rte_epoll_event rev, *elist;
 553        int epfd, ret;
 554
 555        if (handle == NULL)
 556                return 0;
 557
 558        elist = rte_intr_elist_index_get(handle, rxq_idx);
 559        if (rte_intr_efds_index_get(handle, rxq_idx) == elist->fd)
 560                return 0;
 561
 562        VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",
 563                        rxq_idx);
 564
 565        if (elist->fd != -1)
 566                VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",
 567                        elist->fd);
 568
 569        /*
 570         * First remove invalid epoll event, and then install
 571         * the new one. May be solved with a proper API in the
 572         * future.
 573         */
 574        epfd = elist->epfd;
 575        rev = *elist;
 576        ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
 577                        elist);
 578        if (ret) {
 579                VHOST_LOG(ERR, "Delete epoll event failed.\n");
 580                return ret;
 581        }
 582
 583        rev.fd = rte_intr_efds_index_get(handle, rxq_idx);
 584        if (rte_intr_elist_index_set(handle, rxq_idx, rev))
 585                return -rte_errno;
 586
 587        elist = rte_intr_elist_index_get(handle, rxq_idx);
 588        ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd, elist);
 589        if (ret) {
 590                VHOST_LOG(ERR, "Add epoll event failed.\n");
 591                return ret;
 592        }
 593
 594        return 0;
 595}
 596
 597static int
 598eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
 599{
 600        struct rte_vhost_vring vring;
 601        struct vhost_queue *vq;
 602        int old_intr_enable, ret = 0;
 603
 604        vq = dev->data->rx_queues[qid];
 605        if (!vq) {
 606                VHOST_LOG(ERR, "rxq%d is not set up yet\n", qid);
 607                return -1;
 608        }
 609
 610        rte_spinlock_lock(&vq->intr_lock);
 611        old_intr_enable = vq->intr_enable;
 612        vq->intr_enable = 1;
 613        ret = eth_vhost_update_intr(dev, qid);
 614        rte_spinlock_unlock(&vq->intr_lock);
 615
 616        if (ret < 0) {
 617                VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
 618                vq->intr_enable = old_intr_enable;
 619                return ret;
 620        }
 621
 622        ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
 623        if (ret < 0) {
 624                VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
 625                return ret;
 626        }
 627        VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
 628        rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
 629        rte_wmb();
 630
 631        return ret;
 632}
 633
 634static int
 635eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
 636{
 637        struct rte_vhost_vring vring;
 638        struct vhost_queue *vq;
 639        int ret = 0;
 640
 641        vq = dev->data->rx_queues[qid];
 642        if (!vq) {
 643                VHOST_LOG(ERR, "rxq%d is not set up yet\n", qid);
 644                return -1;
 645        }
 646
 647        ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
 648        if (ret < 0) {
 649                VHOST_LOG(ERR, "Failed to get rxq%d's vring", qid);
 650                return ret;
 651        }
 652        VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
 653        rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
 654        rte_wmb();
 655
 656        vq->intr_enable = 0;
 657
 658        return 0;
 659}
 660
 661static void
 662eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
 663{
 664        struct rte_intr_handle *intr_handle = dev->intr_handle;
 665
 666        if (intr_handle != NULL) {
 667                rte_intr_vec_list_free(intr_handle);
 668                rte_intr_instance_free(intr_handle);
 669        }
 670        dev->intr_handle = NULL;
 671}
 672
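    /*
     * Build a vdev interrupt handle whose event fds are the vring kickfds so
     * that Rx interrupt mode can wait on guest kicks. Queues without a valid
     * vring or kickfd yet are skipped and refreshed later from
     * vring_conf_update().
     */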
 673static int
 674eth_vhost_install_intr(struct rte_eth_dev *dev)
 675{
 676        struct rte_vhost_vring vring;
 677        struct vhost_queue *vq;
 678        int nb_rxq = dev->data->nb_rx_queues;
 679        int i;
 680        int ret;
 681
 682        /* uninstall first if we are reconnecting */
 683        if (dev->intr_handle != NULL)
 684                eth_vhost_uninstall_intr(dev);
 685
 686        dev->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
 687        if (dev->intr_handle == NULL) {
 688                VHOST_LOG(ERR, "Failed to allocate intr_handle\n");
 689                return -ENOMEM;
 690        }
 691        if (rte_intr_efd_counter_size_set(dev->intr_handle, sizeof(uint64_t)))
 692                return -rte_errno;
 693
 694        if (rte_intr_vec_list_alloc(dev->intr_handle, NULL, nb_rxq)) {
 695                VHOST_LOG(ERR,
 696                        "Failed to allocate memory for interrupt vector\n");
 697                rte_intr_instance_free(dev->intr_handle);
 698                return -ENOMEM;
 699        }
 700
 702        VHOST_LOG(INFO, "Prepare intr vec\n");
 703        for (i = 0; i < nb_rxq; i++) {
 704                if (rte_intr_vec_list_index_set(dev->intr_handle, i, RTE_INTR_VEC_RXTX_OFFSET + i))
 705                        return -rte_errno;
 706                if (rte_intr_efds_index_set(dev->intr_handle, i, -1))
 707                        return -rte_errno;
 708                vq = dev->data->rx_queues[i];
 709                if (!vq) {
 710                        VHOST_LOG(INFO, "rxq-%d not set up yet, skip!\n", i);
 711                        continue;
 712                }
 713
 714                ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
 715                if (ret < 0) {
 716                        VHOST_LOG(INFO,
 717                                "Failed to get rxq-%d's vring, skip!\n", i);
 718                        continue;
 719                }
 720
 721                if (vring.kickfd < 0) {
 722                        VHOST_LOG(INFO,
 723                                "rxq-%d's kickfd is invalid, skip!\n", i);
 724                        continue;
 725                }
 726
 727                if (rte_intr_efds_index_set(dev->intr_handle, i, vring.kickfd))
 728                        continue;
 729                VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
 730        }
 731
 732        if (rte_intr_nb_efd_set(dev->intr_handle, nb_rxq))
 733                return -rte_errno;
 734
 735        if (rte_intr_max_intr_set(dev->intr_handle, nb_rxq + 1))
 736                return -rte_errno;
 737
 738        if (rte_intr_type_set(dev->intr_handle, RTE_INTR_HANDLE_VDEV))
 739                return -rte_errno;
 740
 741        return 0;
 742}
 743
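    /*
     * Propagate started/attached state to every queue: queuing is allowed only
     * when the port is started, a vhost device is attached and the matching
     * vring is enabled. With wait_queuing, also wait for in-flight bursts to
     * drain before returning.
     */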
 744static void
 745update_queuing_status(struct rte_eth_dev *dev, bool wait_queuing)
 746{
 747        struct pmd_internal *internal = dev->data->dev_private;
 748        struct vhost_queue *vq;
 749        struct rte_vhost_vring_state *state;
 750        unsigned int i;
 751        int allow_queuing = 1;
 752
 753        if (!dev->data->rx_queues || !dev->data->tx_queues)
 754                return;
 755
 756        if (rte_atomic32_read(&internal->started) == 0 ||
 757            rte_atomic32_read(&internal->dev_attached) == 0)
 758                allow_queuing = 0;
 759
 760        state = vring_states[dev->data->port_id];
 761
 762        /* Wait until rx/tx_pkt_burst stops accessing vhost device */
 763        for (i = 0; i < dev->data->nb_rx_queues; i++) {
 764                vq = dev->data->rx_queues[i];
 765                if (vq == NULL)
 766                        continue;
 767                if (allow_queuing && state->cur[vq->virtqueue_id])
 768                        rte_atomic32_set(&vq->allow_queuing, 1);
 769                else
 770                        rte_atomic32_set(&vq->allow_queuing, 0);
 771                while (wait_queuing && rte_atomic32_read(&vq->while_queuing))
 772                        rte_pause();
 773        }
 774
 775        for (i = 0; i < dev->data->nb_tx_queues; i++) {
 776                vq = dev->data->tx_queues[i];
 777                if (vq == NULL)
 778                        continue;
 779                if (allow_queuing && state->cur[vq->virtqueue_id])
 780                        rte_atomic32_set(&vq->allow_queuing, 1);
 781                else
 782                        rte_atomic32_set(&vq->allow_queuing, 0);
 783                while (wait_queuing && rte_atomic32_read(&vq->while_queuing))
 784                        rte_pause();
 785        }
 786}
 787
 788static void
 789queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
 790{
 791        struct vhost_queue *vq;
 792        int i;
 793
 794        for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
 795                vq = eth_dev->data->rx_queues[i];
 796                if (!vq)
 797                        continue;
 798                vq->vid = internal->vid;
 799                vq->internal = internal;
 800                vq->port = eth_dev->data->port_id;
 801        }
 802        for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
 803                vq = eth_dev->data->tx_queues[i];
 804                if (!vq)
 805                        continue;
 806                vq->vid = internal->vid;
 807                vq->internal = internal;
 808                vq->port = eth_dev->data->port_id;
 809        }
 810}
 811
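    /*
     * vhost-user "new device" callback: bind the vid to this port, refresh the
     * NUMA node and negotiated features, plug the queues if they already
     * exist, bring the link up and report an LSC event to the application.
     */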
 812static int
 813new_device(int vid)
 814{
 815        struct rte_eth_dev *eth_dev;
 816        struct internal_list *list;
 817        struct pmd_internal *internal;
 818        struct rte_eth_conf *dev_conf;
 819        unsigned i;
 820        char ifname[PATH_MAX];
 821#ifdef RTE_LIBRTE_VHOST_NUMA
 822        int newnode;
 823#endif
 824
 825        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
 826        list = find_internal_resource(ifname);
 827        if (list == NULL) {
 828                VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
 829                return -1;
 830        }
 831
 832        eth_dev = list->eth_dev;
 833        internal = eth_dev->data->dev_private;
 834        dev_conf = &eth_dev->data->dev_conf;
 835
 836#ifdef RTE_LIBRTE_VHOST_NUMA
 837        newnode = rte_vhost_get_numa_node(vid);
 838        if (newnode >= 0)
 839                eth_dev->data->numa_node = newnode;
 840#endif
 841
 842        if (rte_vhost_get_negotiated_features(vid, &internal->features)) {
 843                VHOST_LOG(ERR, "Failed to get device features\n");
 844                return -1;
 845        }
 846
 847        internal->vid = vid;
 848        if (rte_atomic32_read(&internal->started) == 1) {
 849                queue_setup(eth_dev, internal);
 850
 851                if (dev_conf->intr_conf.rxq) {
 852                        if (eth_vhost_install_intr(eth_dev) < 0) {
 853                                VHOST_LOG(INFO,
 854                                        "Failed to install interrupt handler.\n");
 855                                return -1;
 856                        }
 857                }
 858        } else {
 859                VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
 860        }
 861
 862        for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
 863                rte_vhost_enable_guest_notification(vid, i, 0);
 864
 865        rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
 866
 867        eth_dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
 868
 869        vhost_dev_csum_configure(eth_dev);
 870
 871        rte_atomic32_set(&internal->dev_attached, 1);
 872        update_queuing_status(eth_dev, false);
 873
 874        VHOST_LOG(INFO, "Vhost device %d created\n", vid);
 875
 876        rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
 877
 878        return 0;
 879}
 880
 881static void
 882destroy_device(int vid)
 883{
 884        struct rte_eth_dev *eth_dev;
 885        struct pmd_internal *internal;
 886        struct vhost_queue *vq;
 887        struct internal_list *list;
 888        char ifname[PATH_MAX];
 889        unsigned i;
 890        struct rte_vhost_vring_state *state;
 891
 892        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
 893        list = find_internal_resource(ifname);
 894        if (list == NULL) {
 895                VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
 896                return;
 897        }
 898        eth_dev = list->eth_dev;
 899        internal = eth_dev->data->dev_private;
 900
 901        rte_atomic32_set(&internal->dev_attached, 0);
 902        update_queuing_status(eth_dev, true);
 903
 904        eth_dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
 905
 906        if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
 907                for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
 908                        vq = eth_dev->data->rx_queues[i];
 909                        if (!vq)
 910                                continue;
 911                        vq->vid = -1;
 912                }
 913                for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
 914                        vq = eth_dev->data->tx_queues[i];
 915                        if (!vq)
 916                                continue;
 917                        vq->vid = -1;
 918                }
 919        }
 920
 921        state = vring_states[eth_dev->data->port_id];
 922        rte_spinlock_lock(&state->lock);
 923        for (i = 0; i <= state->max_vring; i++) {
 924                state->cur[i] = false;
 925                state->seen[i] = false;
 926        }
 927        state->max_vring = 0;
 928        rte_spinlock_unlock(&state->lock);
 929
 930        VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
 931        eth_vhost_uninstall_intr(eth_dev);
 932
 933        rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
 934}
 935
 936static int
 937vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
 938{
 939        struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
 940        struct pmd_internal *internal = eth_dev->data->dev_private;
 941        struct vhost_queue *vq;
 942        struct rte_vhost_vring vring;
 943        int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
 944        int ret = 0;
 945
 946        /*
 947         * The vring kickfd may be changed after the new device notification.
 948         * Update it when the vring state is updated.
 949         */
 950        if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
 951            rte_atomic32_read(&internal->dev_attached) &&
 952            rte_atomic32_read(&internal->started) &&
 953            dev_conf->intr_conf.rxq) {
 954                ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
 955                if (ret) {
 956                        VHOST_LOG(ERR, "Failed to get vring %d information.\n",
 957                                        vring_id);
 958                        return ret;
 959                }
 960
 961                if (rte_intr_efds_index_set(eth_dev->intr_handle, rx_idx,
 962                                                   vring.kickfd))
 963                        return -rte_errno;
 964
 965                vq = eth_dev->data->rx_queues[rx_idx];
 966                if (!vq) {
 967                        VHOST_LOG(ERR, "rxq%d is not set up yet\n", rx_idx);
 968                        return -1;
 969                }
 970
 971                rte_spinlock_lock(&vq->intr_lock);
 972                if (vq->intr_enable)
 973                        ret = eth_vhost_update_intr(eth_dev, rx_idx);
 974                rte_spinlock_unlock(&vq->intr_lock);
 975        }
 976
 977        return ret;
 978}
 979
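    /*
     * vhost-user vring state callback: record the enable/disable event for
     * rte_eth_vhost_get_queue_event(), refresh the datapath gates and notify
     * the application with a QUEUE_STATE event.
     */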
 980static int
 981vring_state_changed(int vid, uint16_t vring, int enable)
 982{
 983        struct rte_vhost_vring_state *state;
 984        struct rte_eth_dev *eth_dev;
 985        struct internal_list *list;
 986        char ifname[PATH_MAX];
 987
 988        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
 989        list = find_internal_resource(ifname);
 990        if (list == NULL) {
 991                VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
 992                return -1;
 993        }
 994
 995        eth_dev = list->eth_dev;
 996        /* won't be NULL: allocated in vhost_driver_setup() before the driver is started */
 997        state = vring_states[eth_dev->data->port_id];
 998
 999        if (enable && vring_conf_update(vid, eth_dev, vring))
1000                VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
1001                          (int)vring);
1002
1003        rte_spinlock_lock(&state->lock);
1004        if (state->cur[vring] == enable) {
1005                rte_spinlock_unlock(&state->lock);
1006                return 0;
1007        }
1008        state->cur[vring] = enable;
1009        state->max_vring = RTE_MAX(vring, state->max_vring);
1010        rte_spinlock_unlock(&state->lock);
1011
1012        update_queuing_status(eth_dev, false);
1013
1014        VHOST_LOG(INFO, "vring%u is %s\n",
1015                        vring, enable ? "enabled" : "disabled");
1016
1017        rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
1018
1019        return 0;
1020}
1021
1022static struct rte_vhost_device_ops vhost_ops = {
1023        .new_device          = new_device,
1024        .destroy_device      = destroy_device,
1025        .vring_state_changed = vring_state_changed,
1026};
1027
1028static int
1029vhost_driver_setup(struct rte_eth_dev *eth_dev)
1030{
1031        struct pmd_internal *internal = eth_dev->data->dev_private;
1032        struct internal_list *list = NULL;
1033        struct rte_vhost_vring_state *vring_state = NULL;
1034        unsigned int numa_node = eth_dev->device->numa_node;
1035        const char *name = eth_dev->device->name;
1036
1037        /* Don't try to set up again if it has already been done. */
1038        list = find_internal_resource(internal->iface_name);
1039        if (list)
1040                return 0;
1041
1042        list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
1043        if (list == NULL)
1044                return -1;
1045
1046        vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
1047                                         0, numa_node);
1048        if (vring_state == NULL)
1049                goto free_list;
1050
1051        list->eth_dev = eth_dev;
1052        pthread_mutex_lock(&internal_list_lock);
1053        TAILQ_INSERT_TAIL(&internal_list, list, next);
1054        pthread_mutex_unlock(&internal_list_lock);
1055
1056        rte_spinlock_init(&vring_state->lock);
1057        vring_states[eth_dev->data->port_id] = vring_state;
1058
1059        if (rte_vhost_driver_register(internal->iface_name, internal->flags))
1060                goto list_remove;
1061
1062        if (internal->disable_flags) {
1063                if (rte_vhost_driver_disable_features(internal->iface_name,
1064                                                      internal->disable_flags))
1065                        goto drv_unreg;
1066        }
1067
1068        if (rte_vhost_driver_callback_register(internal->iface_name,
1069                                               &vhost_ops) < 0) {
1070                VHOST_LOG(ERR, "Can't register callbacks\n");
1071                goto drv_unreg;
1072        }
1073
1074        if (rte_vhost_driver_start(internal->iface_name) < 0) {
1075                VHOST_LOG(ERR, "Failed to start driver for %s\n",
1076                          internal->iface_name);
1077                goto drv_unreg;
1078        }
1079
1080        return 0;
1081
1082drv_unreg:
1083        rte_vhost_driver_unregister(internal->iface_name);
1084list_remove:
1085        vring_states[eth_dev->data->port_id] = NULL;
1086        pthread_mutex_lock(&internal_list_lock);
1087        TAILQ_REMOVE(&internal_list, list, next);
1088        pthread_mutex_unlock(&internal_list_lock);
1089        rte_free(vring_state);
1090free_list:
1091        rte_free(list);
1092
1093        return -1;
1094}
1095
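    /*
     * Report the next virtqueue whose enable state changed since the last
     * call; "index" persists across calls so all rings are scanned in a
     * round-robin fashion.
     */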
1096int
1097rte_eth_vhost_get_queue_event(uint16_t port_id,
1098                struct rte_eth_vhost_queue_event *event)
1099{
1100        struct rte_vhost_vring_state *state;
1101        unsigned int i;
1102        int idx;
1103
1104        if (port_id >= RTE_MAX_ETHPORTS) {
1105                VHOST_LOG(ERR, "Invalid port id\n");
1106                return -1;
1107        }
1108
1109        state = vring_states[port_id];
1110        if (!state) {
1111                VHOST_LOG(ERR, "Unused port\n");
1112                return -1;
1113        }
1114
1115        rte_spinlock_lock(&state->lock);
1116        for (i = 0; i <= state->max_vring; i++) {
1117                idx = state->index++ % (state->max_vring + 1);
1118
1119                if (state->cur[idx] != state->seen[idx]) {
1120                        state->seen[idx] = state->cur[idx];
1121                        event->queue_id = idx / 2;
1122                        event->rx = idx & 1;
1123                        event->enable = state->cur[idx];
1124                        rte_spinlock_unlock(&state->lock);
1125                        return 0;
1126                }
1127        }
1128        rte_spinlock_unlock(&state->lock);
1129
1130        return -1;
1131}
1132
1133int
1134rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
1135{
1136        struct internal_list *list;
1137        struct rte_eth_dev *eth_dev;
1138        struct vhost_queue *vq;
1139        int vid = -1;
1140
1141        if (!rte_eth_dev_is_valid_port(port_id))
1142                return -1;
1143
1144        pthread_mutex_lock(&internal_list_lock);
1145
1146        TAILQ_FOREACH(list, &internal_list, next) {
1147                eth_dev = list->eth_dev;
1148                if (eth_dev->data->port_id == port_id) {
1149                        vq = eth_dev->data->rx_queues[0];
1150                        if (vq) {
1151                                vid = vq->vid;
1152                        }
1153                        break;
1154                }
1155        }
1156
1157        pthread_mutex_unlock(&internal_list_lock);
1158
1159        return vid;
1160}
1161
1162static int
1163eth_dev_configure(struct rte_eth_dev *dev)
1164{
1165        struct pmd_internal *internal = dev->data->dev_private;
1166        const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1167
1168        /* NOTE: the same process has to operate a vhost interface
1169         * from beginning to end (from eth_dev configure to eth_dev close).
1170         * It is the user's responsibility at the moment.
1171         */
1172        if (vhost_driver_setup(dev) < 0)
1173                return -1;
1174
1175        internal->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
1176
1177        vhost_dev_csum_configure(dev);
1178
1179        return 0;
1180}
1181
1182static int
1183eth_dev_start(struct rte_eth_dev *eth_dev)
1184{
1185        struct pmd_internal *internal = eth_dev->data->dev_private;
1186        struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1187
1188        queue_setup(eth_dev, internal);
1189
1190        if (rte_atomic32_read(&internal->dev_attached) == 1) {
1191                if (dev_conf->intr_conf.rxq) {
1192                        if (eth_vhost_install_intr(eth_dev) < 0) {
1193                                VHOST_LOG(INFO,
1194                                        "Failed to install interrupt handler.\n");
1195                                return -1;
1196                        }
1197                }
1198        }
1199
1200        rte_atomic32_set(&internal->started, 1);
1201        update_queuing_status(eth_dev, false);
1202
1203        return 0;
1204}
1205
1206static int
1207eth_dev_stop(struct rte_eth_dev *dev)
1208{
1209        struct pmd_internal *internal = dev->data->dev_private;
1210
1211        dev->data->dev_started = 0;
1212        rte_atomic32_set(&internal->started, 0);
1213        update_queuing_status(dev, true);
1214
1215        return 0;
1216}
1217
1218static int
1219eth_dev_close(struct rte_eth_dev *dev)
1220{
1221        struct pmd_internal *internal;
1222        struct internal_list *list;
1223        unsigned int i, ret;
1224
1225        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1226                return 0;
1227
1228        internal = dev->data->dev_private;
1229        if (!internal)
1230                return 0;
1231
1232        ret = eth_dev_stop(dev);
1233
1234        list = find_internal_resource(internal->iface_name);
1235        if (list) {
1236                rte_vhost_driver_unregister(internal->iface_name);
1237                pthread_mutex_lock(&internal_list_lock);
1238                TAILQ_REMOVE(&internal_list, list, next);
1239                pthread_mutex_unlock(&internal_list_lock);
1240                rte_free(list);
1241        }
1242
1243        if (dev->data->rx_queues)
1244                for (i = 0; i < dev->data->nb_rx_queues; i++)
1245                        rte_free(dev->data->rx_queues[i]);
1246
1247        if (dev->data->tx_queues)
1248                for (i = 0; i < dev->data->nb_tx_queues; i++)
1249                        rte_free(dev->data->tx_queues[i]);
1250
1251        rte_free(internal->iface_name);
1252        rte_free(internal);
1253
1254        dev->data->dev_private = NULL;
1255
1256        rte_free(vring_states[dev->data->port_id]);
1257        vring_states[dev->data->port_id] = NULL;
1258
1259        return ret;
1260}
1261
1262static int
1263eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1264                   uint16_t nb_rx_desc __rte_unused,
1265                   unsigned int socket_id,
1266                   const struct rte_eth_rxconf *rx_conf __rte_unused,
1267                   struct rte_mempool *mb_pool)
1268{
1269        struct vhost_queue *vq;
1270
1271        vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1272                        RTE_CACHE_LINE_SIZE, socket_id);
1273        if (vq == NULL) {
1274                VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1275                return -ENOMEM;
1276        }
1277
1278        vq->mb_pool = mb_pool;
1279        vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1280        rte_spinlock_init(&vq->intr_lock);
1281        dev->data->rx_queues[rx_queue_id] = vq;
1282
1283        return 0;
1284}
1285
1286static int
1287eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1288                   uint16_t nb_tx_desc __rte_unused,
1289                   unsigned int socket_id,
1290                   const struct rte_eth_txconf *tx_conf __rte_unused)
1291{
1292        struct vhost_queue *vq;
1293
1294        vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1295                        RTE_CACHE_LINE_SIZE, socket_id);
1296        if (vq == NULL) {
1297                VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1298                return -ENOMEM;
1299        }
1300
1301        vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1302        rte_spinlock_init(&vq->intr_lock);
1303        dev->data->tx_queues[tx_queue_id] = vq;
1304
1305        return 0;
1306}
1307
1308static int
1309eth_dev_info(struct rte_eth_dev *dev,
1310             struct rte_eth_dev_info *dev_info)
1311{
1312        struct pmd_internal *internal;
1313
1314        internal = dev->data->dev_private;
1315        if (internal == NULL) {
1316                VHOST_LOG(ERR, "Invalid device specified\n");
1317                return -ENODEV;
1318        }
1319
1320        dev_info->max_mac_addrs = 1;
1321        dev_info->max_rx_pktlen = (uint32_t)-1;
1322        dev_info->max_rx_queues = internal->max_queues;
1323        dev_info->max_tx_queues = internal->max_queues;
1324        dev_info->min_rx_bufsize = 0;
1325
1326        dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
1327                                RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
1328        if (internal->flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS) {
1329                dev_info->tx_offload_capa |= RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
1330                        RTE_ETH_TX_OFFLOAD_TCP_CKSUM;
1331        }
1332
1333        dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP;
1334        if (internal->flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS) {
1335                dev_info->rx_offload_capa |= RTE_ETH_RX_OFFLOAD_UDP_CKSUM |
1336                        RTE_ETH_RX_OFFLOAD_TCP_CKSUM;
1337        }
1338
1339        return 0;
1340}
1341
1342static int
1343eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1344{
1345        unsigned i;
1346        unsigned long rx_total = 0, tx_total = 0;
1347        unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1348        struct vhost_queue *vq;
1349
1350        for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1351                        i < dev->data->nb_rx_queues; i++) {
1352                if (dev->data->rx_queues[i] == NULL)
1353                        continue;
1354                vq = dev->data->rx_queues[i];
1355                stats->q_ipackets[i] = vq->stats.pkts;
1356                rx_total += stats->q_ipackets[i];
1357
1358                stats->q_ibytes[i] = vq->stats.bytes;
1359                rx_total_bytes += stats->q_ibytes[i];
1360        }
1361
1362        for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1363                        i < dev->data->nb_tx_queues; i++) {
1364                if (dev->data->tx_queues[i] == NULL)
1365                        continue;
1366                vq = dev->data->tx_queues[i];
1367                stats->q_opackets[i] = vq->stats.pkts;
1368                tx_total += stats->q_opackets[i];
1369
1370                stats->q_obytes[i] = vq->stats.bytes;
1371                tx_total_bytes += stats->q_obytes[i];
1372        }
1373
1374        stats->ipackets = rx_total;
1375        stats->opackets = tx_total;
1376        stats->ibytes = rx_total_bytes;
1377        stats->obytes = tx_total_bytes;
1378
1379        return 0;
1380}
1381
1382static int
1383eth_stats_reset(struct rte_eth_dev *dev)
1384{
1385        struct vhost_queue *vq;
1386        unsigned i;
1387
1388        for (i = 0; i < dev->data->nb_rx_queues; i++) {
1389                if (dev->data->rx_queues[i] == NULL)
1390                        continue;
1391                vq = dev->data->rx_queues[i];
1392                vq->stats.pkts = 0;
1393                vq->stats.bytes = 0;
1394        }
1395        for (i = 0; i < dev->data->nb_tx_queues; i++) {
1396                if (dev->data->tx_queues[i] == NULL)
1397                        continue;
1398                vq = dev->data->tx_queues[i];
1399                vq->stats.pkts = 0;
1400                vq->stats.bytes = 0;
1401                vq->stats.missed_pkts = 0;
1402        }
1403
1404        return 0;
1405}
1406
1407static void
1408eth_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1409{
1410        rte_free(dev->data->rx_queues[qid]);
1411}
1412
1413static void
1414eth_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1415{
1416        rte_free(dev->data->tx_queues[qid]);
1417}
1418
1419static int
1420eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1421{
1422        /*
1423         * vHost does not hang onto mbufs: eth_vhost_tx() copies the packet
1424         * data and frees the mbufs, so there is nothing to clean up.
1425         */
1426        return 0;
1427}
1428
1429static int
1430eth_link_update(struct rte_eth_dev *dev __rte_unused,
1431                int wait_to_complete __rte_unused)
1432{
1433        return 0;
1434}
1435
1436static uint32_t
1437eth_rx_queue_count(void *rx_queue)
1438{
1439        struct vhost_queue *vq;
1440
1441        vq = rx_queue;
1442        if (vq == NULL)
1443                return 0;
1444
1445        return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1446}
1447
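    /*
     * rte_power_monitor() condition: the callback returns 0 while it is still
     * safe to sleep and -1 once the monitored location signals new work, with
     * the polarity of the (value & mask) == val test chosen by the vhost
     * "match" flag.
     */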
1448#define CLB_VAL_IDX 0
1449#define CLB_MSK_IDX 1
1450#define CLB_MATCH_IDX 2
1451static int
1452vhost_monitor_callback(const uint64_t value,
1453                const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
1454{
1455        const uint64_t m = opaque[CLB_MSK_IDX];
1456        const uint64_t v = opaque[CLB_VAL_IDX];
1457        const uint64_t c = opaque[CLB_MATCH_IDX];
1458
1459        if (c)
1460                return (value & m) == v ? -1 : 0;
1461        else
1462                return (value & m) == v ? 0 : -1;
1463}
1464
1465static int
1466vhost_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
1467{
1468        struct vhost_queue *vq = rx_queue;
1469        struct rte_vhost_power_monitor_cond vhost_pmc;
1470        int ret;
1471        if (vq == NULL)
1472                return -EINVAL;
1473        ret = rte_vhost_get_monitor_addr(vq->vid, vq->virtqueue_id,
1474                        &vhost_pmc);
1475        if (ret < 0)
1476                return -EINVAL;
1477        pmc->addr = vhost_pmc.addr;
1478        pmc->opaque[CLB_VAL_IDX] = vhost_pmc.val;
1479        pmc->opaque[CLB_MSK_IDX] = vhost_pmc.mask;
1480        pmc->opaque[CLB_MATCH_IDX] = vhost_pmc.match;
1481        pmc->size = vhost_pmc.size;
1482        pmc->fn = vhost_monitor_callback;
1483
1484        return 0;
1485}
1486
1487static const struct eth_dev_ops ops = {
1488        .dev_start = eth_dev_start,
1489        .dev_stop = eth_dev_stop,
1490        .dev_close = eth_dev_close,
1491        .dev_configure = eth_dev_configure,
1492        .dev_infos_get = eth_dev_info,
1493        .rx_queue_setup = eth_rx_queue_setup,
1494        .tx_queue_setup = eth_tx_queue_setup,
1495        .rx_queue_release = eth_rx_queue_release,
1496        .tx_queue_release = eth_tx_queue_release,
1497        .tx_done_cleanup = eth_tx_done_cleanup,
1498        .link_update = eth_link_update,
1499        .stats_get = eth_stats_get,
1500        .stats_reset = eth_stats_reset,
1501        .xstats_reset = vhost_dev_xstats_reset,
1502        .xstats_get = vhost_dev_xstats_get,
1503        .xstats_get_names = vhost_dev_xstats_get_names,
1504        .rx_queue_intr_enable = eth_rxq_intr_enable,
1505        .rx_queue_intr_disable = eth_rxq_intr_disable,
1506        .get_monitor_addr = vhost_get_monitor_addr,
1507};
1508
1509static int
1510eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1511        int16_t queues, const unsigned int numa_node, uint64_t flags,
1512        uint64_t disable_flags)
1513{
1514        const char *name = rte_vdev_device_name(dev);
1515        struct rte_eth_dev_data *data;
1516        struct pmd_internal *internal = NULL;
1517        struct rte_eth_dev *eth_dev = NULL;
1518        struct rte_ether_addr *eth_addr = NULL;
1519
1520        VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1521                numa_node);
1522
1523        /* reserve an ethdev entry */
1524        eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1525        if (eth_dev == NULL)
1526                goto error;
1527        data = eth_dev->data;
1528
1529        eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1530        if (eth_addr == NULL)
1531                goto error;
1532        data->mac_addrs = eth_addr;
1533        *eth_addr = base_eth_addr;
1534        eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1535
1536        /* now put it all together
1537         * - store the interface name, flags and queue counts in internal,
1538         * - initialize link state and device flags,
1539         * - and hook up the dev_ops and rx/tx burst functions below
1540         */
1541        internal = eth_dev->data->dev_private;
1542        internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1543                                                 0, numa_node);
1544        if (internal->iface_name == NULL)
1545                goto error;
1546        strcpy(internal->iface_name, iface_name);
1547
1548        data->nb_rx_queues = queues;
1549        data->nb_tx_queues = queues;
1550        internal->max_queues = queues;
1551        internal->vid = -1;
1552        internal->flags = flags;
1553        internal->disable_flags = disable_flags;
1554        data->dev_link = pmd_link;
1555        data->dev_flags = RTE_ETH_DEV_INTR_LSC |
1556                                RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1557        data->promiscuous = 1;
1558        data->all_multicast = 1;
1559
1560        eth_dev->dev_ops = &ops;
1561        eth_dev->rx_queue_count = eth_rx_queue_count;
1562
1563        /* finally assign rx and tx ops */
1564        eth_dev->rx_pkt_burst = eth_vhost_rx;
1565        eth_dev->tx_pkt_burst = eth_vhost_tx;
1566
1567        rte_eth_dev_probing_finish(eth_dev);
1568        return 0;
1569
1570error:
1571        if (internal)
1572                rte_free(internal->iface_name);
1573        rte_eth_dev_release_port(eth_dev);
1574
1575        return -1;
1576}
1577
1578static inline int
1579open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1580{
1581        const char **iface_name = extra_args;
1582
1583        if (value == NULL)
1584                return -1;
1585
1586        *iface_name = value;
1587
1588        return 0;
1589}
1590
1591static inline int
1592open_int(const char *key __rte_unused, const char *value, void *extra_args)
1593{
1594        uint16_t *n = extra_args;
1595
1596        if (value == NULL || extra_args == NULL)
1597                return -EINVAL;
1598
1599        *n = (uint16_t)strtoul(value, NULL, 0);
1600        if (*n == USHRT_MAX && errno == ERANGE)
1601                return -1;
1602
1603        return 0;
1604}
1605
1606static int
1607rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1608{
1609        struct rte_kvargs *kvlist = NULL;
1610        int ret = 0;
1611        char *iface_name;
1612        uint16_t queues;
1613        uint64_t flags = RTE_VHOST_USER_NET_STATS_ENABLE;
1614        uint64_t disable_flags = 0;
1615        int client_mode = 0;
1616        int iommu_support = 0;
1617        int postcopy_support = 0;
1618        int tso = 0;
1619        int linear_buf = 0;
1620        int ext_buf = 0;
1621        int legacy_ol_flags = 0;
1622        struct rte_eth_dev *eth_dev;
1623        const char *name = rte_vdev_device_name(dev);
1624
1625        VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1626
1627        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1628                eth_dev = rte_eth_dev_attach_secondary(name);
1629                if (!eth_dev) {
1630                        VHOST_LOG(ERR, "Failed to probe %s\n", name);
1631                        return -1;
1632                }
1633                eth_dev->rx_pkt_burst = eth_vhost_rx;
1634                eth_dev->tx_pkt_burst = eth_vhost_tx;
1635                eth_dev->dev_ops = &ops;
1636                if (dev->device.numa_node == SOCKET_ID_ANY)
1637                        dev->device.numa_node = rte_socket_id();
1638                eth_dev->device = &dev->device;
1639                rte_eth_dev_probing_finish(eth_dev);
1640                return 0;
1641        }
1642
1643        kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1644        if (kvlist == NULL)
1645                return -1;
1646
1647        if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1648                ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1649                                         &open_iface, &iface_name);
1650                if (ret < 0)
1651                        goto out_free;
1652        } else {
1653                ret = -1;
1654                goto out_free;
1655        }
1656
1657        if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1658                ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1659                                         &open_int, &queues);
1660                if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1661                        goto out_free;
1662
1663        } else
1664                queues = 1;
1665
1666        if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1667                ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1668                                         &open_int, &client_mode);
1669                if (ret < 0)
1670                        goto out_free;
1671
1672                if (client_mode)
1673                        flags |= RTE_VHOST_USER_CLIENT;
1674        }
1675
1676        if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1677                ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1678                                         &open_int, &iommu_support);
1679                if (ret < 0)
1680                        goto out_free;
1681
1682                if (iommu_support)
1683                        flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1684        }
1685
1686        if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1687                ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1688                                         &open_int, &postcopy_support);
1689                if (ret < 0)
1690                        goto out_free;
1691
1692                if (postcopy_support)
1693                        flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1694        }
1695
1696        if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1697                ret = rte_kvargs_process(kvlist,
1698                                ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1699                                &open_int, &tso);
1700                if (ret < 0)
1701                        goto out_free;
1702        }
1703
1704        if (tso == 0) {
1705                disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1706                disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1707        }
1708
1709        if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1710                ret = rte_kvargs_process(kvlist,
1711                                ETH_VHOST_LINEAR_BUF,
1712                                &open_int, &linear_buf);
1713                if (ret < 0)
1714                        goto out_free;
1715
1716                if (linear_buf == 1)
1717                        flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1718        }
1719
1720        if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1721                ret = rte_kvargs_process(kvlist,
1722                                ETH_VHOST_EXT_BUF,
1723                                &open_int, &ext_buf);
1724                if (ret < 0)
1725                        goto out_free;
1726
1727                if (ext_buf == 1)
1728                        flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1729        }
1730
1731        if (rte_kvargs_count(kvlist, ETH_VHOST_LEGACY_OL_FLAGS) == 1) {
1732                ret = rte_kvargs_process(kvlist,
1733                                ETH_VHOST_LEGACY_OL_FLAGS,
1734                                &open_int, &legacy_ol_flags);
1735                if (ret < 0)
1736                        goto out_free;
1737        }
1738
1739        if (legacy_ol_flags == 0)
1740                flags |= RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1741
1742        if (dev->device.numa_node == SOCKET_ID_ANY)
1743                dev->device.numa_node = rte_socket_id();
1744
1745        ret = eth_dev_vhost_create(dev, iface_name, queues,
1746                                   dev->device.numa_node, flags, disable_flags);
1747        if (ret == -1)
1748                VHOST_LOG(ERR, "Failed to create %s\n", name);
1749
1750out_free:
1751        rte_kvargs_free(kvlist);
1752        return ret;
1753}
1754
1755static int
1756rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1757{
1758        const char *name;
1759        struct rte_eth_dev *eth_dev = NULL;
1760
1761        name = rte_vdev_device_name(dev);
1762        VHOST_LOG(INFO, "Un-initializing pmd_vhost for %s\n", name);
1763
1764        /* find an ethdev entry */
1765        eth_dev = rte_eth_dev_allocated(name);
1766        if (eth_dev == NULL)
1767                return 0;
1768
1769        eth_dev_close(eth_dev);
1770        rte_eth_dev_release_port(eth_dev);
1771
1772        return 0;
1773}
1774
1775static struct rte_vdev_driver pmd_vhost_drv = {
1776        .probe = rte_pmd_vhost_probe,
1777        .remove = rte_pmd_vhost_remove,
1778};
1779
1780RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1781RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1782RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1783        "iface=<ifc> "
1784        "queues=<int> "
1785        "client=<0|1> "
1786        "iommu-support=<0|1> "
1787        "postcopy-support=<0|1> "
1788        "tso=<0|1> "
1789        "linear-buffer=<0|1> "
1790        "ext-buffer=<0|1>");
1791