dpdk/drivers/net/vhost/rte_eth_vhost.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2016 IGEL Co., Ltd.
   3 * Copyright(c) 2016-2018 Intel Corporation
   4 */
   5#include <unistd.h>
   6#include <pthread.h>
   7#include <stdbool.h>
   8#include <sys/epoll.h>
   9
  10#include <rte_mbuf.h>
  11#include <ethdev_driver.h>
  12#include <ethdev_vdev.h>
  13#include <rte_malloc.h>
  14#include <rte_memcpy.h>
  15#include <rte_bus_vdev.h>
  16#include <rte_kvargs.h>
  17#include <rte_vhost.h>
  18#include <rte_spinlock.h>
  19
  20#include "rte_eth_vhost.h"
  21
  22RTE_LOG_REGISTER_DEFAULT(vhost_logtype, NOTICE);
  23
  24#define VHOST_LOG(level, ...) \
  25        rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
  26
  27enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
  28
  29#define ETH_VHOST_IFACE_ARG             "iface"
  30#define ETH_VHOST_QUEUES_ARG            "queues"
  31#define ETH_VHOST_CLIENT_ARG            "client"
  32#define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
  33#define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
  34#define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
  35#define ETH_VHOST_LINEAR_BUF  "linear-buffer"
  36#define ETH_VHOST_EXT_BUF  "ext-buffer"
  37#define VHOST_MAX_PKT_BURST 32
  38
  39static const char *valid_arguments[] = {
  40        ETH_VHOST_IFACE_ARG,
  41        ETH_VHOST_QUEUES_ARG,
  42        ETH_VHOST_CLIENT_ARG,
  43        ETH_VHOST_IOMMU_SUPPORT,
  44        ETH_VHOST_POSTCOPY_SUPPORT,
  45        ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
  46        ETH_VHOST_LINEAR_BUF,
  47        ETH_VHOST_EXT_BUF,
  48        NULL
  49};
  50
  51static struct rte_ether_addr base_eth_addr = {
  52        .addr_bytes = {
  53                0x56 /* V */,
  54                0x48 /* H */,
  55                0x4F /* O */,
  56                0x53 /* S */,
  57                0x54 /* T */,
  58                0x00
  59        }
  60};
  61
  62enum vhost_xstats_pkts {
  63        VHOST_UNDERSIZE_PKT = 0,
  64        VHOST_64_PKT,
  65        VHOST_65_TO_127_PKT,
  66        VHOST_128_TO_255_PKT,
  67        VHOST_256_TO_511_PKT,
  68        VHOST_512_TO_1023_PKT,
  69        VHOST_1024_TO_1522_PKT,
  70        VHOST_1523_TO_MAX_PKT,
  71        VHOST_BROADCAST_PKT,
  72        VHOST_MULTICAST_PKT,
  73        VHOST_UNICAST_PKT,
  74        VHOST_PKT,
  75        VHOST_BYTE,
  76        VHOST_MISSED_PKT,
  77        VHOST_ERRORS_PKT,
  78        VHOST_ERRORS_FRAGMENTED,
  79        VHOST_ERRORS_JABBER,
  80        VHOST_UNKNOWN_PROTOCOL,
  81        VHOST_XSTATS_MAX,
  82};
  83
  84struct vhost_stats {
  85        uint64_t pkts;
  86        uint64_t bytes;
  87        uint64_t missed_pkts;
  88        uint64_t xstats[VHOST_XSTATS_MAX];
  89};
  90
  91struct vhost_queue {
  92        int vid;
  93        rte_atomic32_t allow_queuing;
  94        rte_atomic32_t while_queuing;
  95        struct pmd_internal *internal;
  96        struct rte_mempool *mb_pool;
  97        uint16_t port;
  98        uint16_t virtqueue_id;
  99        struct vhost_stats stats;
 100        int intr_enable;
 101        rte_spinlock_t intr_lock;
 102};
 103
 104struct pmd_internal {
 105        rte_atomic32_t dev_attached;
 106        char *iface_name;
 107        uint64_t flags;
 108        uint64_t disable_flags;
 109        uint16_t max_queues;
 110        int vid;
 111        rte_atomic32_t started;
 112        uint8_t vlan_strip;
 113};
 114
 115struct internal_list {
 116        TAILQ_ENTRY(internal_list) next;
 117        struct rte_eth_dev *eth_dev;
 118};
 119
 120TAILQ_HEAD(internal_list_head, internal_list);
 121static struct internal_list_head internal_list =
 122        TAILQ_HEAD_INITIALIZER(internal_list);
 123
 124static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
 125
 126static struct rte_eth_link pmd_link = {
 127                .link_speed = 10000,
 128                .link_duplex = ETH_LINK_FULL_DUPLEX,
 129                .link_status = ETH_LINK_DOWN
 130};
 131
 132struct rte_vhost_vring_state {
 133        rte_spinlock_t lock;
 134
 135        bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
 136        bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
 137        unsigned int index;
 138        unsigned int max_vring;
 139};
 140
 141static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
 142
 143#define VHOST_XSTATS_NAME_SIZE 64
 144
 145struct vhost_xstats_name_off {
 146        char name[VHOST_XSTATS_NAME_SIZE];
 147        uint64_t offset;
 148};
 149
  150/* [rx]_ is prepended to the name string here */
 151static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
 152        {"good_packets",
 153         offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
 154        {"total_bytes",
 155         offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
 156        {"missed_pkts",
 157         offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
 158        {"broadcast_packets",
 159         offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
 160        {"multicast_packets",
 161         offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
 162        {"unicast_packets",
 163         offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
  164        {"undersize_packets",
 165         offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
 166        {"size_64_packets",
 167         offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
 168        {"size_65_to_127_packets",
 169         offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
 170        {"size_128_to_255_packets",
 171         offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
 172        {"size_256_to_511_packets",
 173         offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
 174        {"size_512_to_1023_packets",
 175         offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
 176        {"size_1024_to_1522_packets",
 177         offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
 178        {"size_1523_to_max_packets",
 179         offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
 180        {"errors_with_bad_CRC",
 181         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
 182        {"fragmented_errors",
 183         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
 184        {"jabber_errors",
 185         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
 186        {"unknown_protos_packets",
 187         offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
 188};
 189
 190/* [tx]_ is prepended to the name string here */
 191static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
 192        {"good_packets",
 193         offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
 194        {"total_bytes",
 195         offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
 196        {"missed_pkts",
 197         offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
 198        {"broadcast_packets",
 199         offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
 200        {"multicast_packets",
 201         offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
 202        {"unicast_packets",
 203         offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
 204        {"undersize_packets",
 205         offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
 206        {"size_64_packets",
 207         offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
 208        {"size_65_to_127_packets",
 209         offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
 210        {"size_128_to_255_packets",
 211         offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
 212        {"size_256_to_511_packets",
 213         offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
 214        {"size_512_to_1023_packets",
 215         offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
 216        {"size_1024_to_1522_packets",
 217         offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
 218        {"size_1523_to_max_packets",
 219         offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
 220        {"errors_with_bad_CRC",
 221         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
 222};
 223
 224#define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
 225                                sizeof(vhost_rxport_stat_strings[0]))
 226
 227#define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
 228                                sizeof(vhost_txport_stat_strings[0]))
 229
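     /* Clear the per-queue xstats counters on every configured Rx/Tx queue. */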
 230static int
 231vhost_dev_xstats_reset(struct rte_eth_dev *dev)
 232{
 233        struct vhost_queue *vq = NULL;
 234        unsigned int i = 0;
 235
 236        for (i = 0; i < dev->data->nb_rx_queues; i++) {
 237                vq = dev->data->rx_queues[i];
 238                if (!vq)
 239                        continue;
 240                memset(&vq->stats, 0, sizeof(vq->stats));
 241        }
 242        for (i = 0; i < dev->data->nb_tx_queues; i++) {
 243                vq = dev->data->tx_queues[i];
 244                if (!vq)
 245                        continue;
 246                memset(&vq->stats, 0, sizeof(vq->stats));
 247        }
 248
 249        return 0;
 250}
 251
 252static int
 253vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
 254                           struct rte_eth_xstat_name *xstats_names,
 255                           unsigned int limit __rte_unused)
 256{
 257        unsigned int t = 0;
 258        int count = 0;
 259        int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
 260
 261        if (!xstats_names)
 262                return nstats;
 263        for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
 264                snprintf(xstats_names[count].name,
 265                         sizeof(xstats_names[count].name),
 266                         "rx_%s", vhost_rxport_stat_strings[t].name);
 267                count++;
 268        }
 269        for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
 270                snprintf(xstats_names[count].name,
 271                         sizeof(xstats_names[count].name),
 272                         "tx_%s", vhost_txport_stat_strings[t].name);
 273                count++;
 274        }
 275        return count;
 276}
 277
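     /*
      * Fill the xstats table: each counter is the sum of the matching
      * per-queue counter over all Rx queues, then over all Tx queues,
      * in the same order as the name tables above.
      */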
 278static int
 279vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
 280                     unsigned int n)
 281{
 282        unsigned int i;
 283        unsigned int t;
 284        unsigned int count = 0;
 285        struct vhost_queue *vq = NULL;
 286        unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
 287
 288        if (n < nxstats)
 289                return nxstats;
 290
 291        for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
 292                xstats[count].value = 0;
 293                for (i = 0; i < dev->data->nb_rx_queues; i++) {
 294                        vq = dev->data->rx_queues[i];
 295                        if (!vq)
 296                                continue;
 297                        xstats[count].value +=
 298                                *(uint64_t *)(((char *)vq)
 299                                + vhost_rxport_stat_strings[t].offset);
 300                }
 301                xstats[count].id = count;
 302                count++;
 303        }
 304        for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
 305                xstats[count].value = 0;
 306                for (i = 0; i < dev->data->nb_tx_queues; i++) {
 307                        vq = dev->data->tx_queues[i];
 308                        if (!vq)
 309                                continue;
 310                        xstats[count].value +=
 311                                *(uint64_t *)(((char *)vq)
 312                                + vhost_txport_stat_strings[t].offset);
 313                }
 314                xstats[count].id = count;
 315                count++;
 316        }
 317        return count;
 318}
 319
 320static inline void
 321vhost_count_xcast_packets(struct vhost_queue *vq,
 322                                struct rte_mbuf *mbuf)
 323{
 324        struct rte_ether_addr *ea = NULL;
 325        struct vhost_stats *pstats = &vq->stats;
 326
 327        ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
 328        if (rte_is_multicast_ether_addr(ea)) {
 329                if (rte_is_broadcast_ether_addr(ea))
 330                        pstats->xstats[VHOST_BROADCAST_PKT]++;
 331                else
 332                        pstats->xstats[VHOST_MULTICAST_PKT]++;
 333        } else {
 334                pstats->xstats[VHOST_UNICAST_PKT]++;
 335        }
 336}
 337
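     /*
      * Account bytes, missed packets and the per-size-bucket counters for a
      * burst that was just received or transmitted on this queue.
      */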
 338static void
 339vhost_update_packet_xstats(struct vhost_queue *vq, struct rte_mbuf **bufs,
 340                           uint16_t count, uint64_t nb_bytes,
 341                           uint64_t nb_missed)
 342{
 343        uint32_t pkt_len = 0;
 344        uint64_t i = 0;
 345        uint64_t index;
 346        struct vhost_stats *pstats = &vq->stats;
 347
 348        pstats->xstats[VHOST_BYTE] += nb_bytes;
 349        pstats->xstats[VHOST_MISSED_PKT] += nb_missed;
 350        pstats->xstats[VHOST_UNICAST_PKT] += nb_missed;
 351
 352        for (i = 0; i < count ; i++) {
 353                pstats->xstats[VHOST_PKT]++;
 354                pkt_len = bufs[i]->pkt_len;
 355                if (pkt_len == 64) {
 356                        pstats->xstats[VHOST_64_PKT]++;
 357                } else if (pkt_len > 64 && pkt_len < 1024) {
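                             /* log2 bucketing: 32 - clz(pkt_len) - 5 maps
                              * 65..127 bytes to VHOST_65_TO_127_PKT (2) up
                              * to 512..1023 bytes to VHOST_512_TO_1023_PKT (5).
                              */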
 358                        index = (sizeof(pkt_len) * 8)
 359                                - __builtin_clz(pkt_len) - 5;
 360                        pstats->xstats[index]++;
 361                } else {
 362                        if (pkt_len < 64)
 363                                pstats->xstats[VHOST_UNDERSIZE_PKT]++;
 364                        else if (pkt_len <= 1522)
 365                                pstats->xstats[VHOST_1024_TO_1522_PKT]++;
 366                        else if (pkt_len > 1522)
 367                                pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
 368                }
 369                vhost_count_xcast_packets(vq, bufs[i]);
 370        }
 371}
 372
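     /*
      * Rx burst callback: dequeue up to nb_bufs packets from the guest Tx
      * virtqueue in VHOST_MAX_PKT_BURST chunks. The allow_queuing and
      * while_queuing flags fence the datapath against device attach/detach.
      */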
 373static uint16_t
 374eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
 375{
 376        struct vhost_queue *r = q;
 377        uint16_t i, nb_rx = 0;
 378        uint16_t nb_receive = nb_bufs;
 379        uint64_t nb_bytes = 0;
 380
 381        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
 382                return 0;
 383
 384        rte_atomic32_set(&r->while_queuing, 1);
 385
 386        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
 387                goto out;
 388
 389        /* Dequeue packets from guest TX queue */
 390        while (nb_receive) {
 391                uint16_t nb_pkts;
 392                uint16_t num = (uint16_t)RTE_MIN(nb_receive,
 393                                                 VHOST_MAX_PKT_BURST);
 394
 395                nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
 396                                                  r->mb_pool, &bufs[nb_rx],
 397                                                  num);
 398
 399                nb_rx += nb_pkts;
 400                nb_receive -= nb_pkts;
 401                if (nb_pkts < num)
 402                        break;
 403        }
 404
 405        r->stats.pkts += nb_rx;
 406
 407        for (i = 0; likely(i < nb_rx); i++) {
 408                bufs[i]->port = r->port;
 409                bufs[i]->vlan_tci = 0;
 410
 411                if (r->internal->vlan_strip)
 412                        rte_vlan_strip(bufs[i]);
 413
 414                nb_bytes += bufs[i]->pkt_len;
 415        }
 416
 417        r->stats.bytes += nb_bytes;
 418        vhost_update_packet_xstats(r, bufs, nb_rx, nb_bytes, 0);
 419
 420out:
 421        rte_atomic32_set(&r->while_queuing, 0);
 422
 423        return nb_rx;
 424}
 425
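     /*
      * Tx burst callback: optionally insert VLAN tags, enqueue the burst to
      * the guest Rx virtqueue, free the transmitted mbufs, and account any
      * packets the guest could not accept as missed.
      */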
 426static uint16_t
 427eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
 428{
 429        struct vhost_queue *r = q;
 430        uint16_t i, nb_tx = 0;
 431        uint16_t nb_send = 0;
 432        uint64_t nb_bytes = 0;
 433        uint64_t nb_missed = 0;
 434
 435        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
 436                return 0;
 437
 438        rte_atomic32_set(&r->while_queuing, 1);
 439
 440        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
 441                goto out;
 442
 443        for (i = 0; i < nb_bufs; i++) {
 444                struct rte_mbuf *m = bufs[i];
 445
 446                /* Do VLAN tag insertion */
 447                if (m->ol_flags & PKT_TX_VLAN_PKT) {
 448                        int error = rte_vlan_insert(&m);
 449                        if (unlikely(error)) {
 450                                rte_pktmbuf_free(m);
 451                                continue;
 452                        }
 453                }
 454
 455                bufs[nb_send] = m;
 456                ++nb_send;
 457        }
 458
 459        /* Enqueue packets to guest RX queue */
 460        while (nb_send) {
 461                uint16_t nb_pkts;
 462                uint16_t num = (uint16_t)RTE_MIN(nb_send,
 463                                                 VHOST_MAX_PKT_BURST);
 464
 465                nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
 466                                                  &bufs[nb_tx], num);
 467
 468                nb_tx += nb_pkts;
 469                nb_send -= nb_pkts;
 470                if (nb_pkts < num)
 471                        break;
 472        }
 473
 474        for (i = 0; likely(i < nb_tx); i++)
 475                nb_bytes += bufs[i]->pkt_len;
 476
 477        nb_missed = nb_bufs - nb_tx;
 478
 479        r->stats.pkts += nb_tx;
 480        r->stats.bytes += nb_bytes;
 481        r->stats.missed_pkts += nb_bufs - nb_tx;
 482
 483        vhost_update_packet_xstats(r, bufs, nb_tx, nb_bytes, nb_missed);
 484
 485        /* According to RFC2863, ifHCOutUcastPkts, ifHCOutMulticastPkts and
 486         * ifHCOutBroadcastPkts counters are increased when packets are not
 487         * transmitted successfully.
 488         */
 489        for (i = nb_tx; i < nb_bufs; i++)
 490                vhost_count_xcast_packets(r, bufs[i]);
 491
 492        for (i = 0; likely(i < nb_tx); i++)
 493                rte_pktmbuf_free(bufs[i]);
 494out:
 495        rte_atomic32_set(&r->while_queuing, 0);
 496
 497        return nb_tx;
 498}
 499
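     /* Find the ethdev that was created for a given vhost-user socket path. */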
 500static inline struct internal_list *
 501find_internal_resource(char *ifname)
 502{
 503        int found = 0;
 504        struct internal_list *list;
 505        struct pmd_internal *internal;
 506
 507        if (ifname == NULL)
 508                return NULL;
 509
 510        pthread_mutex_lock(&internal_list_lock);
 511
 512        TAILQ_FOREACH(list, &internal_list, next) {
 513                internal = list->eth_dev->data->dev_private;
 514                if (!strcmp(internal->iface_name, ifname)) {
 515                        found = 1;
 516                        break;
 517                }
 518        }
 519
 520        pthread_mutex_unlock(&internal_list_lock);
 521
 522        if (!found)
 523                return NULL;
 524
 525        return list;
 526}
 527
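     /*
      * The vring kickfd can change (e.g. on reconnection), so drop the stale
      * fd from the epoll set and register the one currently in handle->efds.
      */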
 528static int
 529eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
 530{
 531        struct rte_intr_handle *handle = eth_dev->intr_handle;
 532        struct rte_epoll_event rev;
 533        int epfd, ret;
 534
 535        if (!handle)
 536                return 0;
 537
 538        if (handle->efds[rxq_idx] == handle->elist[rxq_idx].fd)
 539                return 0;
 540
 541        VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",
 542                        rxq_idx);
 543
 544        if (handle->elist[rxq_idx].fd != -1)
 545                VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",
 546                                handle->elist[rxq_idx].fd);
 547
 548        /*
 549         * First remove invalid epoll event, and then install
 550         * the new one. May be solved with a proper API in the
 551         * future.
 552         */
 553        epfd = handle->elist[rxq_idx].epfd;
 554        rev = handle->elist[rxq_idx];
 555        ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
 556                        &handle->elist[rxq_idx]);
 557        if (ret) {
 558                VHOST_LOG(ERR, "Delete epoll event failed.\n");
 559                return ret;
 560        }
 561
 562        rev.fd = handle->efds[rxq_idx];
 563        handle->elist[rxq_idx] = rev;
 564        ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd,
 565                        &handle->elist[rxq_idx]);
 566        if (ret) {
 567                VHOST_LOG(ERR, "Add epoll event failed.\n");
 568                return ret;
 569        }
 570
 571        return 0;
 572}
 573
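     /* Switch one Rx queue to interrupt mode by enabling guest notification. */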
 574static int
 575eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
 576{
 577        struct rte_vhost_vring vring;
 578        struct vhost_queue *vq;
 579        int old_intr_enable, ret = 0;
 580
 581        vq = dev->data->rx_queues[qid];
 582        if (!vq) {
  583                VHOST_LOG(ERR, "rxq%d is not set up yet\n", qid);
 584                return -1;
 585        }
 586
 587        rte_spinlock_lock(&vq->intr_lock);
 588        old_intr_enable = vq->intr_enable;
 589        vq->intr_enable = 1;
 590        ret = eth_vhost_update_intr(dev, qid);
 591        rte_spinlock_unlock(&vq->intr_lock);
 592
 593        if (ret < 0) {
 594                VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
 595                vq->intr_enable = old_intr_enable;
 596                return ret;
 597        }
 598
 599        ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
 600        if (ret < 0) {
 601                VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
 602                return ret;
 603        }
 604        VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
 605        rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
 606        rte_wmb();
 607
 608        return ret;
 609}
 610
 611static int
 612eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
 613{
 614        struct rte_vhost_vring vring;
 615        struct vhost_queue *vq;
 616        int ret = 0;
 617
 618        vq = dev->data->rx_queues[qid];
 619        if (!vq) {
  620                VHOST_LOG(ERR, "rxq%d is not set up yet\n", qid);
 621                return -1;
 622        }
 623
 624        ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
 625        if (ret < 0) {
  626                VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
 627                return ret;
 628        }
 629        VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
 630        rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
 631        rte_wmb();
 632
 633        vq->intr_enable = 0;
 634
 635        return 0;
 636}
 637
 638static void
 639eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
 640{
 641        struct rte_intr_handle *intr_handle = dev->intr_handle;
 642
 643        if (intr_handle) {
 644                if (intr_handle->intr_vec)
 645                        free(intr_handle->intr_vec);
 646                free(intr_handle);
 647        }
 648
 649        dev->intr_handle = NULL;
 650}
 651
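     /*
      * Build the device interrupt handle for Rx interrupt mode: one entry
      * per Rx queue, with the vring kickfd used as event fd when available.
      */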
 652static int
 653eth_vhost_install_intr(struct rte_eth_dev *dev)
 654{
 655        struct rte_vhost_vring vring;
 656        struct vhost_queue *vq;
 657        int nb_rxq = dev->data->nb_rx_queues;
 658        int i;
 659        int ret;
 660
  661        /* uninstall first if we are reconnecting */
 662        if (dev->intr_handle)
 663                eth_vhost_uninstall_intr(dev);
 664
 665        dev->intr_handle = malloc(sizeof(*dev->intr_handle));
 666        if (!dev->intr_handle) {
  667                VHOST_LOG(ERR, "Failed to allocate intr_handle\n");
 668                return -ENOMEM;
 669        }
 670        memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
 671
 672        dev->intr_handle->efd_counter_size = sizeof(uint64_t);
 673
 674        dev->intr_handle->intr_vec =
 675                malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
 676
 677        if (!dev->intr_handle->intr_vec) {
 678                VHOST_LOG(ERR,
 679                        "Failed to allocate memory for interrupt vector\n");
 680                free(dev->intr_handle);
 681                return -ENOMEM;
 682        }
 683
 684        VHOST_LOG(INFO, "Prepare intr vec\n");
 685        for (i = 0; i < nb_rxq; i++) {
 686                dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
 687                dev->intr_handle->efds[i] = -1;
 688                vq = dev->data->rx_queues[i];
 689                if (!vq) {
  690                        VHOST_LOG(INFO, "rxq-%d not set up yet, skip!\n", i);
 691                        continue;
 692                }
 693
 694                ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
 695                if (ret < 0) {
 696                        VHOST_LOG(INFO,
 697                                "Failed to get rxq-%d's vring, skip!\n", i);
 698                        continue;
 699                }
 700
 701                if (vring.kickfd < 0) {
 702                        VHOST_LOG(INFO,
 703                                "rxq-%d's kickfd is invalid, skip!\n", i);
 704                        continue;
 705                }
 706                dev->intr_handle->efds[i] = vring.kickfd;
 707                VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
 708        }
 709
 710        dev->intr_handle->nb_efd = nb_rxq;
 711        dev->intr_handle->max_intr = nb_rxq + 1;
 712        dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
 713
 714        return 0;
 715}
 716
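     /*
      * Propagate the started/attached state to every queue and wait until
      * any in-flight rx/tx burst has left the critical section.
      */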
 717static void
 718update_queuing_status(struct rte_eth_dev *dev)
 719{
 720        struct pmd_internal *internal = dev->data->dev_private;
 721        struct vhost_queue *vq;
 722        unsigned int i;
 723        int allow_queuing = 1;
 724
 725        if (!dev->data->rx_queues || !dev->data->tx_queues)
 726                return;
 727
 728        if (rte_atomic32_read(&internal->started) == 0 ||
 729            rte_atomic32_read(&internal->dev_attached) == 0)
 730                allow_queuing = 0;
 731
 732        /* Wait until rx/tx_pkt_burst stops accessing vhost device */
 733        for (i = 0; i < dev->data->nb_rx_queues; i++) {
 734                vq = dev->data->rx_queues[i];
 735                if (vq == NULL)
 736                        continue;
 737                rte_atomic32_set(&vq->allow_queuing, allow_queuing);
 738                while (rte_atomic32_read(&vq->while_queuing))
 739                        rte_pause();
 740        }
 741
 742        for (i = 0; i < dev->data->nb_tx_queues; i++) {
 743                vq = dev->data->tx_queues[i];
 744                if (vq == NULL)
 745                        continue;
 746                rte_atomic32_set(&vq->allow_queuing, allow_queuing);
 747                while (rte_atomic32_read(&vq->while_queuing))
 748                        rte_pause();
 749        }
 750}
 751
 752static void
 753queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
 754{
 755        struct vhost_queue *vq;
 756        int i;
 757
 758        for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
 759                vq = eth_dev->data->rx_queues[i];
 760                if (!vq)
 761                        continue;
 762                vq->vid = internal->vid;
 763                vq->internal = internal;
 764                vq->port = eth_dev->data->port_id;
 765        }
 766        for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
 767                vq = eth_dev->data->tx_queues[i];
 768                if (!vq)
 769                        continue;
 770                vq->vid = internal->vid;
 771                vq->internal = internal;
 772                vq->port = eth_dev->data->port_id;
 773        }
 774}
 775
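     /* vhost library callback: a frontend connected to our socket. */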
 776static int
 777new_device(int vid)
 778{
 779        struct rte_eth_dev *eth_dev;
 780        struct internal_list *list;
 781        struct pmd_internal *internal;
 782        struct rte_eth_conf *dev_conf;
 783        unsigned i;
 784        char ifname[PATH_MAX];
 785#ifdef RTE_LIBRTE_VHOST_NUMA
 786        int newnode;
 787#endif
 788
 789        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
 790        list = find_internal_resource(ifname);
 791        if (list == NULL) {
 792                VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
 793                return -1;
 794        }
 795
 796        eth_dev = list->eth_dev;
 797        internal = eth_dev->data->dev_private;
 798        dev_conf = &eth_dev->data->dev_conf;
 799
 800#ifdef RTE_LIBRTE_VHOST_NUMA
 801        newnode = rte_vhost_get_numa_node(vid);
 802        if (newnode >= 0)
 803                eth_dev->data->numa_node = newnode;
 804#endif
 805
 806        internal->vid = vid;
 807        if (rte_atomic32_read(&internal->started) == 1) {
 808                queue_setup(eth_dev, internal);
 809
 810                if (dev_conf->intr_conf.rxq) {
 811                        if (eth_vhost_install_intr(eth_dev) < 0) {
  812                                VHOST_LOG(INFO,
  813                                        "Failed to install interrupt handler.\n");
  814                                return -1;
 815                        }
 816                }
 817        } else {
  818                VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
 819        }
 820
 821        for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
 822                rte_vhost_enable_guest_notification(vid, i, 0);
 823
 824        rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
 825
 826        eth_dev->data->dev_link.link_status = ETH_LINK_UP;
 827
 828        rte_atomic32_set(&internal->dev_attached, 1);
 829        update_queuing_status(eth_dev);
 830
 831        VHOST_LOG(INFO, "Vhost device %d created\n", vid);
 832
 833        rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
 834
 835        return 0;
 836}
 837
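     /* vhost library callback: the frontend disconnected. */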
 838static void
 839destroy_device(int vid)
 840{
 841        struct rte_eth_dev *eth_dev;
 842        struct pmd_internal *internal;
 843        struct vhost_queue *vq;
 844        struct internal_list *list;
 845        char ifname[PATH_MAX];
 846        unsigned i;
 847        struct rte_vhost_vring_state *state;
 848
 849        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
 850        list = find_internal_resource(ifname);
 851        if (list == NULL) {
 852                VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
 853                return;
 854        }
 855        eth_dev = list->eth_dev;
 856        internal = eth_dev->data->dev_private;
 857
 858        rte_atomic32_set(&internal->dev_attached, 0);
 859        update_queuing_status(eth_dev);
 860
 861        eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
 862
 863        if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
 864                for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
 865                        vq = eth_dev->data->rx_queues[i];
 866                        if (!vq)
 867                                continue;
 868                        vq->vid = -1;
 869                }
 870                for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
 871                        vq = eth_dev->data->tx_queues[i];
 872                        if (!vq)
 873                                continue;
 874                        vq->vid = -1;
 875                }
 876        }
 877
 878        state = vring_states[eth_dev->data->port_id];
 879        rte_spinlock_lock(&state->lock);
 880        for (i = 0; i <= state->max_vring; i++) {
 881                state->cur[i] = false;
 882                state->seen[i] = false;
 883        }
 884        state->max_vring = 0;
 885        rte_spinlock_unlock(&state->lock);
 886
 887        VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
 888        eth_vhost_uninstall_intr(eth_dev);
 889
 890        rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
 891}
 892
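     /*
      * Refresh the kickfd recorded for the Rx queue behind this vring and,
      * if Rx interrupts are enabled, re-arm the epoll handler with it.
      */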
 893static int
 894vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
 895{
 896        struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
 897        struct pmd_internal *internal = eth_dev->data->dev_private;
 898        struct vhost_queue *vq;
 899        struct rte_vhost_vring vring;
 900        int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
 901        int ret = 0;
 902
 903        /*
 904         * The vring kickfd may be changed after the new device notification.
 905         * Update it when the vring state is updated.
 906         */
 907        if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
 908            rte_atomic32_read(&internal->dev_attached) &&
 909            rte_atomic32_read(&internal->started) &&
 910            dev_conf->intr_conf.rxq) {
 911                ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
 912                if (ret) {
 913                        VHOST_LOG(ERR, "Failed to get vring %d information.\n",
 914                                        vring_id);
 915                        return ret;
 916                }
 917                eth_dev->intr_handle->efds[rx_idx] = vring.kickfd;
 918
 919                vq = eth_dev->data->rx_queues[rx_idx];
 920                if (!vq) {
  921                        VHOST_LOG(ERR, "rxq%d is not set up yet\n", rx_idx);
 922                        return -1;
 923                }
 924
 925                rte_spinlock_lock(&vq->intr_lock);
 926                if (vq->intr_enable)
 927                        ret = eth_vhost_update_intr(eth_dev, rx_idx);
 928                rte_spinlock_unlock(&vq->intr_lock);
 929        }
 930
 931        return ret;
 932}
 933
 934static int
 935vring_state_changed(int vid, uint16_t vring, int enable)
 936{
 937        struct rte_vhost_vring_state *state;
 938        struct rte_eth_dev *eth_dev;
 939        struct internal_list *list;
 940        char ifname[PATH_MAX];
 941
 942        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
 943        list = find_internal_resource(ifname);
 944        if (list == NULL) {
 945                VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
 946                return -1;
 947        }
 948
 949        eth_dev = list->eth_dev;
 950        /* won't be NULL */
 951        state = vring_states[eth_dev->data->port_id];
 952
 953        if (enable && vring_conf_update(vid, eth_dev, vring))
 954                VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
 955                          (int)vring);
 956
 957        rte_spinlock_lock(&state->lock);
 958        if (state->cur[vring] == enable) {
 959                rte_spinlock_unlock(&state->lock);
 960                return 0;
 961        }
 962        state->cur[vring] = enable;
 963        state->max_vring = RTE_MAX(vring, state->max_vring);
 964        rte_spinlock_unlock(&state->lock);
 965
 966        VHOST_LOG(INFO, "vring%u is %s\n",
 967                        vring, enable ? "enabled" : "disabled");
 968
 969        rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
 970
 971        return 0;
 972}
 973
 974static struct vhost_device_ops vhost_ops = {
 975        .new_device          = new_device,
 976        .destroy_device      = destroy_device,
 977        .vring_state_changed = vring_state_changed,
 978};
 979
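     /*
      * One-time per-device setup: register the vhost-user socket, the
      * callback ops and the vring state tracking, then start the driver.
      */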
 980static int
 981vhost_driver_setup(struct rte_eth_dev *eth_dev)
 982{
 983        struct pmd_internal *internal = eth_dev->data->dev_private;
 984        struct internal_list *list = NULL;
 985        struct rte_vhost_vring_state *vring_state = NULL;
 986        unsigned int numa_node = eth_dev->device->numa_node;
 987        const char *name = eth_dev->device->name;
 988
 989        /* Don't try to setup again if it has already been done. */
 990        list = find_internal_resource(internal->iface_name);
 991        if (list)
 992                return 0;
 993
 994        list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
 995        if (list == NULL)
 996                return -1;
 997
 998        vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
 999                                         0, numa_node);
1000        if (vring_state == NULL)
1001                goto free_list;
1002
1003        list->eth_dev = eth_dev;
1004        pthread_mutex_lock(&internal_list_lock);
1005        TAILQ_INSERT_TAIL(&internal_list, list, next);
1006        pthread_mutex_unlock(&internal_list_lock);
1007
1008        rte_spinlock_init(&vring_state->lock);
1009        vring_states[eth_dev->data->port_id] = vring_state;
1010
1011        if (rte_vhost_driver_register(internal->iface_name, internal->flags))
1012                goto list_remove;
1013
1014        if (internal->disable_flags) {
1015                if (rte_vhost_driver_disable_features(internal->iface_name,
1016                                                      internal->disable_flags))
1017                        goto drv_unreg;
1018        }
1019
1020        if (rte_vhost_driver_callback_register(internal->iface_name,
1021                                               &vhost_ops) < 0) {
1022                VHOST_LOG(ERR, "Can't register callbacks\n");
1023                goto drv_unreg;
1024        }
1025
1026        if (rte_vhost_driver_start(internal->iface_name) < 0) {
1027                VHOST_LOG(ERR, "Failed to start driver for %s\n",
1028                          internal->iface_name);
1029                goto drv_unreg;
1030        }
1031
1032        return 0;
1033
1034drv_unreg:
1035        rte_vhost_driver_unregister(internal->iface_name);
1036list_remove:
1037        vring_states[eth_dev->data->port_id] = NULL;
1038        pthread_mutex_lock(&internal_list_lock);
1039        TAILQ_REMOVE(&internal_list, list, next);
1040        pthread_mutex_unlock(&internal_list_lock);
1041        rte_free(vring_state);
1042free_list:
1043        rte_free(list);
1044
1045        return -1;
1046}
1047
1048int
1049rte_eth_vhost_get_queue_event(uint16_t port_id,
1050                struct rte_eth_vhost_queue_event *event)
1051{
1052        struct rte_vhost_vring_state *state;
1053        unsigned int i;
1054        int idx;
1055
1056        if (port_id >= RTE_MAX_ETHPORTS) {
1057                VHOST_LOG(ERR, "Invalid port id\n");
1058                return -1;
1059        }
1060
1061        state = vring_states[port_id];
1062        if (!state) {
1063                VHOST_LOG(ERR, "Unused port\n");
1064                return -1;
1065        }
1066
1067        rte_spinlock_lock(&state->lock);
1068        for (i = 0; i <= state->max_vring; i++) {
1069                idx = state->index++ % (state->max_vring + 1);
1070
1071                if (state->cur[idx] != state->seen[idx]) {
1072                        state->seen[idx] = state->cur[idx];
1073                        event->queue_id = idx / 2;
1074                        event->rx = idx & 1;
1075                        event->enable = state->cur[idx];
1076                        rte_spinlock_unlock(&state->lock);
1077                        return 0;
1078                }
1079        }
1080        rte_spinlock_unlock(&state->lock);
1081
1082        return -1;
1083}
1084
1085int
1086rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
1087{
1088        struct internal_list *list;
1089        struct rte_eth_dev *eth_dev;
1090        struct vhost_queue *vq;
1091        int vid = -1;
1092
1093        if (!rte_eth_dev_is_valid_port(port_id))
1094                return -1;
1095
1096        pthread_mutex_lock(&internal_list_lock);
1097
1098        TAILQ_FOREACH(list, &internal_list, next) {
1099                eth_dev = list->eth_dev;
1100                if (eth_dev->data->port_id == port_id) {
1101                        vq = eth_dev->data->rx_queues[0];
1102                        if (vq) {
1103                                vid = vq->vid;
1104                        }
1105                        break;
1106                }
1107        }
1108
1109        pthread_mutex_unlock(&internal_list_lock);
1110
1111        return vid;
1112}
1113
1114static int
1115eth_dev_configure(struct rte_eth_dev *dev)
1116{
1117        struct pmd_internal *internal = dev->data->dev_private;
1118        const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1119
1120        /* NOTE: the same process has to operate a vhost interface
1121         * from beginning to end (from eth_dev configure to eth_dev close).
 1122         * It is the user's responsibility at the moment.
1123         */
1124        if (vhost_driver_setup(dev) < 0)
1125                return -1;
1126
1127        internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1128
1129        return 0;
1130}
1131
1132static int
1133eth_dev_start(struct rte_eth_dev *eth_dev)
1134{
1135        struct pmd_internal *internal = eth_dev->data->dev_private;
1136        struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1137
1138        queue_setup(eth_dev, internal);
1139
1140        if (rte_atomic32_read(&internal->dev_attached) == 1) {
1141                if (dev_conf->intr_conf.rxq) {
1142                        if (eth_vhost_install_intr(eth_dev) < 0) {
 1143                                VHOST_LOG(INFO,
 1144                                        "Failed to install interrupt handler.\n");
 1145                                return -1;
1146                        }
1147                }
1148        }
1149
1150        rte_atomic32_set(&internal->started, 1);
1151        update_queuing_status(eth_dev);
1152
1153        return 0;
1154}
1155
1156static int
1157eth_dev_stop(struct rte_eth_dev *dev)
1158{
1159        struct pmd_internal *internal = dev->data->dev_private;
1160
1161        dev->data->dev_started = 0;
1162        rte_atomic32_set(&internal->started, 0);
1163        update_queuing_status(dev);
1164
1165        return 0;
1166}
1167
1168static int
1169eth_dev_close(struct rte_eth_dev *dev)
1170{
1171        struct pmd_internal *internal;
1172        struct internal_list *list;
1173        unsigned int i, ret;
1174
1175        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1176                return 0;
1177
1178        internal = dev->data->dev_private;
1179        if (!internal)
1180                return 0;
1181
1182        ret = eth_dev_stop(dev);
1183
1184        list = find_internal_resource(internal->iface_name);
1185        if (list) {
1186                rte_vhost_driver_unregister(internal->iface_name);
1187                pthread_mutex_lock(&internal_list_lock);
1188                TAILQ_REMOVE(&internal_list, list, next);
1189                pthread_mutex_unlock(&internal_list_lock);
1190                rte_free(list);
1191        }
1192
1193        if (dev->data->rx_queues)
1194                for (i = 0; i < dev->data->nb_rx_queues; i++)
1195                        rte_free(dev->data->rx_queues[i]);
1196
1197        if (dev->data->tx_queues)
1198                for (i = 0; i < dev->data->nb_tx_queues; i++)
1199                        rte_free(dev->data->tx_queues[i]);
1200
1201        rte_free(internal->iface_name);
1202        rte_free(internal);
1203
1204        dev->data->dev_private = NULL;
1205
1206        rte_free(vring_states[dev->data->port_id]);
1207        vring_states[dev->data->port_id] = NULL;
1208
1209        return ret;
1210}
1211
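     /*
      * Queue to virtqueue mapping: ethdev Rx queue N reads from guest
      * virtqueue 2N + 1 (the guest Tx ring) and ethdev Tx queue N writes
      * to virtqueue 2N (the guest Rx ring).
      */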
1212static int
1213eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1214                   uint16_t nb_rx_desc __rte_unused,
1215                   unsigned int socket_id,
1216                   const struct rte_eth_rxconf *rx_conf __rte_unused,
1217                   struct rte_mempool *mb_pool)
1218{
1219        struct vhost_queue *vq;
1220
1221        vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1222                        RTE_CACHE_LINE_SIZE, socket_id);
1223        if (vq == NULL) {
1224                VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1225                return -ENOMEM;
1226        }
1227
1228        vq->mb_pool = mb_pool;
1229        vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1230        rte_spinlock_init(&vq->intr_lock);
1231        dev->data->rx_queues[rx_queue_id] = vq;
1232
1233        return 0;
1234}
1235
1236static int
1237eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1238                   uint16_t nb_tx_desc __rte_unused,
1239                   unsigned int socket_id,
1240                   const struct rte_eth_txconf *tx_conf __rte_unused)
1241{
1242        struct vhost_queue *vq;
1243
1244        vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1245                        RTE_CACHE_LINE_SIZE, socket_id);
1246        if (vq == NULL) {
1247                VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1248                return -ENOMEM;
1249        }
1250
1251        vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1252        rte_spinlock_init(&vq->intr_lock);
1253        dev->data->tx_queues[tx_queue_id] = vq;
1254
1255        return 0;
1256}
1257
1258static int
1259eth_dev_info(struct rte_eth_dev *dev,
1260             struct rte_eth_dev_info *dev_info)
1261{
1262        struct pmd_internal *internal;
1263
1264        internal = dev->data->dev_private;
1265        if (internal == NULL) {
1266                VHOST_LOG(ERR, "Invalid device specified\n");
1267                return -ENODEV;
1268        }
1269
1270        dev_info->max_mac_addrs = 1;
1271        dev_info->max_rx_pktlen = (uint32_t)-1;
1272        dev_info->max_rx_queues = internal->max_queues;
1273        dev_info->max_tx_queues = internal->max_queues;
1274        dev_info->min_rx_bufsize = 0;
1275
1276        dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1277                                DEV_TX_OFFLOAD_VLAN_INSERT;
1278        dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1279
1280        return 0;
1281}
1282
1283static int
1284eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1285{
1286        unsigned i;
1287        unsigned long rx_total = 0, tx_total = 0;
1288        unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1289        struct vhost_queue *vq;
1290
1291        for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1292                        i < dev->data->nb_rx_queues; i++) {
1293                if (dev->data->rx_queues[i] == NULL)
1294                        continue;
1295                vq = dev->data->rx_queues[i];
1296                stats->q_ipackets[i] = vq->stats.pkts;
1297                rx_total += stats->q_ipackets[i];
1298
1299                stats->q_ibytes[i] = vq->stats.bytes;
1300                rx_total_bytes += stats->q_ibytes[i];
1301        }
1302
1303        for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1304                        i < dev->data->nb_tx_queues; i++) {
1305                if (dev->data->tx_queues[i] == NULL)
1306                        continue;
1307                vq = dev->data->tx_queues[i];
1308                stats->q_opackets[i] = vq->stats.pkts;
1309                tx_total += stats->q_opackets[i];
1310
1311                stats->q_obytes[i] = vq->stats.bytes;
1312                tx_total_bytes += stats->q_obytes[i];
1313        }
1314
1315        stats->ipackets = rx_total;
1316        stats->opackets = tx_total;
1317        stats->ibytes = rx_total_bytes;
1318        stats->obytes = tx_total_bytes;
1319
1320        return 0;
1321}
1322
1323static int
1324eth_stats_reset(struct rte_eth_dev *dev)
1325{
1326        struct vhost_queue *vq;
1327        unsigned i;
1328
1329        for (i = 0; i < dev->data->nb_rx_queues; i++) {
1330                if (dev->data->rx_queues[i] == NULL)
1331                        continue;
1332                vq = dev->data->rx_queues[i];
1333                vq->stats.pkts = 0;
1334                vq->stats.bytes = 0;
1335        }
1336        for (i = 0; i < dev->data->nb_tx_queues; i++) {
1337                if (dev->data->tx_queues[i] == NULL)
1338                        continue;
1339                vq = dev->data->tx_queues[i];
1340                vq->stats.pkts = 0;
1341                vq->stats.bytes = 0;
1342                vq->stats.missed_pkts = 0;
1343        }
1344
1345        return 0;
1346}
1347
1348static void
1349eth_queue_release(void *q)
1350{
1351        rte_free(q);
1352}
1353
1354static int
1355eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1356{
1357        /*
 1358         * vHost does not hang onto mbufs. eth_vhost_tx() copies packet data
 1359         * and releases the mbufs, so there is nothing to clean up.
1360         */
1361        return 0;
1362}
1363
1364static int
1365eth_link_update(struct rte_eth_dev *dev __rte_unused,
1366                int wait_to_complete __rte_unused)
1367{
1368        return 0;
1369}
1370
1371static uint32_t
1372eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1373{
1374        struct vhost_queue *vq;
1375
1376        vq = dev->data->rx_queues[rx_queue_id];
1377        if (vq == NULL)
1378                return 0;
1379
1380        return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1381}
1382
1383static const struct eth_dev_ops ops = {
1384        .dev_start = eth_dev_start,
1385        .dev_stop = eth_dev_stop,
1386        .dev_close = eth_dev_close,
1387        .dev_configure = eth_dev_configure,
1388        .dev_infos_get = eth_dev_info,
1389        .rx_queue_setup = eth_rx_queue_setup,
1390        .tx_queue_setup = eth_tx_queue_setup,
1391        .rx_queue_release = eth_queue_release,
1392        .tx_queue_release = eth_queue_release,
1393        .tx_done_cleanup = eth_tx_done_cleanup,
1394        .link_update = eth_link_update,
1395        .stats_get = eth_stats_get,
1396        .stats_reset = eth_stats_reset,
1397        .xstats_reset = vhost_dev_xstats_reset,
1398        .xstats_get = vhost_dev_xstats_get,
1399        .xstats_get_names = vhost_dev_xstats_get_names,
1400        .rx_queue_intr_enable = eth_rxq_intr_enable,
1401        .rx_queue_intr_disable = eth_rxq_intr_disable,
1402};
1403
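     /*
      * Allocate and initialise the ethdev: the MAC is derived from the port
      * id, queue counts come from the "queues" devarg, and the vhost flags
      * are stored for the driver setup done later in eth_dev_configure().
      */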
1404static int
1405eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1406        int16_t queues, const unsigned int numa_node, uint64_t flags,
1407        uint64_t disable_flags)
1408{
1409        const char *name = rte_vdev_device_name(dev);
1410        struct rte_eth_dev_data *data;
1411        struct pmd_internal *internal = NULL;
1412        struct rte_eth_dev *eth_dev = NULL;
1413        struct rte_ether_addr *eth_addr = NULL;
1414
1415        VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1416                numa_node);
1417
1418        /* reserve an ethdev entry */
1419        eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1420        if (eth_dev == NULL)
1421                goto error;
1422        data = eth_dev->data;
1423
1424        eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1425        if (eth_addr == NULL)
1426                goto error;
1427        data->mac_addrs = eth_addr;
1428        *eth_addr = base_eth_addr;
1429        eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1430
1431        /* now put it all together
1432         * - store queue data in internal,
1433         * - point eth_dev_data to internals
1434         * - and point eth_dev structure to new eth_dev_data structure
1435         */
1436        internal = eth_dev->data->dev_private;
1437        internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1438                                                 0, numa_node);
1439        if (internal->iface_name == NULL)
1440                goto error;
1441        strcpy(internal->iface_name, iface_name);
1442
1443        data->nb_rx_queues = queues;
1444        data->nb_tx_queues = queues;
1445        internal->max_queues = queues;
1446        internal->vid = -1;
1447        internal->flags = flags;
1448        internal->disable_flags = disable_flags;
1449        data->dev_link = pmd_link;
1450        data->dev_flags = RTE_ETH_DEV_INTR_LSC |
1451                                RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1452        data->promiscuous = 1;
1453        data->all_multicast = 1;
1454
1455        eth_dev->dev_ops = &ops;
1456        eth_dev->rx_queue_count = eth_rx_queue_count;
1457
1458        /* finally assign rx and tx ops */
1459        eth_dev->rx_pkt_burst = eth_vhost_rx;
1460        eth_dev->tx_pkt_burst = eth_vhost_tx;
1461
1462        rte_eth_dev_probing_finish(eth_dev);
1463        return 0;
1464
1465error:
1466        if (internal)
1467                rte_free(internal->iface_name);
1468        rte_eth_dev_release_port(eth_dev);
1469
1470        return -1;
1471}
1472
1473static inline int
1474open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1475{
1476        const char **iface_name = extra_args;
1477
1478        if (value == NULL)
1479                return -1;
1480
1481        *iface_name = value;
1482
1483        return 0;
1484}
1485
1486static inline int
1487open_int(const char *key __rte_unused, const char *value, void *extra_args)
1488{
1489        uint16_t *n = extra_args;
1490
1491        if (value == NULL || extra_args == NULL)
1492                return -EINVAL;
1493
1494        *n = (uint16_t)strtoul(value, NULL, 0);
1495        if (*n == USHRT_MAX && errno == ERANGE)
1496                return -1;
1497
1498        return 0;
1499}
1500
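     /*
      * vdev probe: parse the devargs listed in valid_arguments[], turn them
      * into RTE_VHOST_USER_* flags and create (or attach to) the ethdev.
      */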
1501static int
1502rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1503{
1504        struct rte_kvargs *kvlist = NULL;
1505        int ret = 0;
1506        char *iface_name;
1507        uint16_t queues;
1508        uint64_t flags = 0;
1509        uint64_t disable_flags = 0;
1510        int client_mode = 0;
1511        int iommu_support = 0;
1512        int postcopy_support = 0;
1513        int tso = 0;
1514        int linear_buf = 0;
1515        int ext_buf = 0;
1516        struct rte_eth_dev *eth_dev;
1517        const char *name = rte_vdev_device_name(dev);
1518
1519        VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1520
1521        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1522                eth_dev = rte_eth_dev_attach_secondary(name);
1523                if (!eth_dev) {
1524                        VHOST_LOG(ERR, "Failed to probe %s\n", name);
1525                        return -1;
1526                }
1527                eth_dev->rx_pkt_burst = eth_vhost_rx;
1528                eth_dev->tx_pkt_burst = eth_vhost_tx;
1529                eth_dev->dev_ops = &ops;
1530                if (dev->device.numa_node == SOCKET_ID_ANY)
1531                        dev->device.numa_node = rte_socket_id();
1532                eth_dev->device = &dev->device;
1533                rte_eth_dev_probing_finish(eth_dev);
1534                return 0;
1535        }
1536
1537        kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1538        if (kvlist == NULL)
1539                return -1;
1540
1541        if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1542                ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1543                                         &open_iface, &iface_name);
1544                if (ret < 0)
1545                        goto out_free;
1546        } else {
1547                ret = -1;
1548                goto out_free;
1549        }
1550
1551        if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1552                ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1553                                         &open_int, &queues);
1554                if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1555                        goto out_free;
1556
1557        } else
1558                queues = 1;
1559
1560        if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1561                ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1562                                         &open_int, &client_mode);
1563                if (ret < 0)
1564                        goto out_free;
1565
1566                if (client_mode)
1567                        flags |= RTE_VHOST_USER_CLIENT;
1568        }
1569
1570        if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1571                ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1572                                         &open_int, &iommu_support);
1573                if (ret < 0)
1574                        goto out_free;
1575
1576                if (iommu_support)
1577                        flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1578        }
1579
1580        if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1581                ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1582                                         &open_int, &postcopy_support);
1583                if (ret < 0)
1584                        goto out_free;
1585
1586                if (postcopy_support)
1587                        flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1588        }
1589
1590        if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1591                ret = rte_kvargs_process(kvlist,
1592                                ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1593                                &open_int, &tso);
1594                if (ret < 0)
1595                        goto out_free;
1596
1597                if (tso == 0) {
1598                        disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1599                        disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1600                }
1601        }
1602
1603        if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1604                ret = rte_kvargs_process(kvlist,
1605                                ETH_VHOST_LINEAR_BUF,
1606                                &open_int, &linear_buf);
1607                if (ret < 0)
1608                        goto out_free;
1609
1610                if (linear_buf == 1)
1611                        flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1612        }
1613
1614        if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1615                ret = rte_kvargs_process(kvlist,
1616                                ETH_VHOST_EXT_BUF,
1617                                &open_int, &ext_buf);
1618                if (ret < 0)
1619                        goto out_free;
1620
1621                if (ext_buf == 1)
1622                        flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1623        }
1624
1625        if (dev->device.numa_node == SOCKET_ID_ANY)
1626                dev->device.numa_node = rte_socket_id();
1627
1628        ret = eth_dev_vhost_create(dev, iface_name, queues,
1629                                   dev->device.numa_node, flags, disable_flags);
1630        if (ret == -1)
1631                VHOST_LOG(ERR, "Failed to create %s\n", name);
1632
1633out_free:
1634        rte_kvargs_free(kvlist);
1635        return ret;
1636}
1637
1638static int
1639rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1640{
1641        const char *name;
1642        struct rte_eth_dev *eth_dev = NULL;
1643
1644        name = rte_vdev_device_name(dev);
1645        VHOST_LOG(INFO, "Un-Initializing pmd_vhost for %s\n", name);
1646
1647        /* find an ethdev entry */
1648        eth_dev = rte_eth_dev_allocated(name);
1649        if (eth_dev == NULL)
1650                return 0;
1651
1652        eth_dev_close(eth_dev);
1653        rte_eth_dev_release_port(eth_dev);
1654
1655        return 0;
1656}
1657
1658static struct rte_vdev_driver pmd_vhost_drv = {
1659        .probe = rte_pmd_vhost_probe,
1660        .remove = rte_pmd_vhost_remove,
1661};
1662
1663RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1664RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1665RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1666        "iface=<ifc> "
1667        "queues=<int> "
1668        "client=<0|1> "
1669        "iommu-support=<0|1> "
1670        "postcopy-support=<0|1> "
1671        "tso=<0|1> "
1672        "linear-buffer=<0|1> "
1673        "ext-buffer=<0|1>");
1674