dpdk/drivers/vdpa/ifc/ifcvf_vdpa.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2018 Intel Corporation
   3 */
   4
   5#include <unistd.h>
   6#include <pthread.h>
   7#include <fcntl.h>
   8#include <string.h>
   9#include <sys/ioctl.h>
  10#include <sys/epoll.h>
  11#include <linux/virtio_net.h>
   12#include <stdbool.h>
/* errno, USHRT_MAX and eventfd() are used directly in this file */
#include <errno.h>
#include <limits.h>
#include <sys/eventfd.h>
  13
  14#include <rte_eal_paging.h>
  15#include <rte_malloc.h>
  16#include <rte_memory.h>
  17#include <rte_bus_pci.h>
  18#include <rte_vhost.h>
  19#include <rte_vdpa.h>
  20#include <rte_vdpa_dev.h>
  21#include <rte_vfio.h>
  22#include <rte_spinlock.h>
  23#include <rte_log.h>
  24#include <rte_kvargs.h>
  25#include <rte_devargs.h>
  26
  27#include "base/ifcvf.h"
  28
  29RTE_LOG_REGISTER(ifcvf_vdpa_logtype, pmd.vdpa.ifcvf, NOTICE);
  30#define DRV_LOG(level, fmt, args...) \
  31        rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
  32                "IFCVF %s(): " fmt "\n", __func__, ##args)
  33
  34#define IFCVF_USED_RING_LEN(size) \
  35        ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
  36
  37#define IFCVF_VDPA_MODE         "vdpa"
  38#define IFCVF_SW_FALLBACK_LM    "sw-live-migration"
  39
  40#define THREAD_NAME_LEN 16
  41
  42static const char * const ifcvf_valid_arguments[] = {
  43        IFCVF_VDPA_MODE,
  44        IFCVF_SW_FALLBACK_LM,
  45        NULL
  46};
  47
  48struct ifcvf_internal {
  49        struct rte_pci_device *pdev;
  50        struct ifcvf_hw hw;
  51        int configured;
  52        int vfio_container_fd;
  53        int vfio_group_fd;
  54        int vfio_dev_fd;
  55        pthread_t tid;  /* thread for notify relay */
  56        int epfd;
  57        int vid;
  58        struct rte_vdpa_device *vdev;
  59        uint16_t max_queues;
  60        uint64_t features;
  61        rte_atomic32_t started;
  62        rte_atomic32_t dev_attached;
  63        rte_atomic32_t running;
  64        rte_spinlock_t lock;
  65        bool sw_lm;
  66        bool sw_fallback_running;
  67        /* mediated vring for sw fallback */
  68        struct vring m_vring[IFCVF_MAX_QUEUES * 2];
  69        /* eventfd for used ring interrupt */
  70        int intr_fd[IFCVF_MAX_QUEUES * 2];
  71};
  72
  73struct internal_list {
  74        TAILQ_ENTRY(internal_list) next;
  75        struct ifcvf_internal *internal;
  76};
  77
  78TAILQ_HEAD(internal_list_head, internal_list);
  79static struct internal_list_head internal_list =
  80        TAILQ_HEAD_INITIALIZER(internal_list);
  81
  82static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
  83
  84static void update_used_ring(struct ifcvf_internal *internal, uint16_t qid);
  85
  86static struct internal_list *
  87find_internal_resource_by_vdev(struct rte_vdpa_device *vdev)
  88{
  89        int found = 0;
  90        struct internal_list *list;
  91
  92        pthread_mutex_lock(&internal_list_lock);
  93
  94        TAILQ_FOREACH(list, &internal_list, next) {
  95                if (vdev == list->internal->vdev) {
  96                        found = 1;
  97                        break;
  98                }
  99        }
 100
 101        pthread_mutex_unlock(&internal_list_lock);
 102
 103        if (!found)
 104                return NULL;
 105
 106        return list;
 107}
 108
 109static struct internal_list *
 110find_internal_resource_by_dev(struct rte_pci_device *pdev)
 111{
 112        int found = 0;
 113        struct internal_list *list;
 114
 115        pthread_mutex_lock(&internal_list_lock);
 116
 117        TAILQ_FOREACH(list, &internal_list, next) {
 118                if (!rte_pci_addr_cmp(&pdev->addr,
 119                                        &list->internal->pdev->addr)) {
 120                        found = 1;
 121                        break;
 122                }
 123        }
 124
 125        pthread_mutex_unlock(&internal_list_lock);
 126
 127        if (!found)
 128                return NULL;
 129
 130        return list;
 131}
 132
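     /*
      * Set up a dedicated VFIO container for the VF: look up its IOMMU group,
      * bind that group to a newly created container, map the PCI device and
      * mirror its BAR resources into the ifcvf_hw structure.
      */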
 133static int
 134ifcvf_vfio_setup(struct ifcvf_internal *internal)
 135{
 136        struct rte_pci_device *dev = internal->pdev;
 137        char devname[RTE_DEV_NAME_MAX_LEN] = {0};
 138        int iommu_group_num;
 139        int i, ret;
 140
 141        internal->vfio_dev_fd = -1;
 142        internal->vfio_group_fd = -1;
 143        internal->vfio_container_fd = -1;
 144
 145        rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
 146        ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
 147                        &iommu_group_num);
 148        if (ret <= 0) {
 149                DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
 150                return -1;
 151        }
 152
 153        internal->vfio_container_fd = rte_vfio_container_create();
 154        if (internal->vfio_container_fd < 0)
 155                return -1;
 156
 157        internal->vfio_group_fd = rte_vfio_container_group_bind(
 158                        internal->vfio_container_fd, iommu_group_num);
 159        if (internal->vfio_group_fd < 0)
 160                goto err;
 161
 162        if (rte_pci_map_device(dev))
 163                goto err;
 164
 165        internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
 166
 167        for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
 168                        i++) {
 169                internal->hw.mem_resource[i].addr =
 170                        internal->pdev->mem_resource[i].addr;
 171                internal->hw.mem_resource[i].phys_addr =
 172                        internal->pdev->mem_resource[i].phys_addr;
 173                internal->hw.mem_resource[i].len =
 174                        internal->pdev->mem_resource[i].len;
 175        }
 176
 177        return 0;
 178
 179err:
 180        rte_vfio_container_destroy(internal->vfio_container_fd);
 181        return -1;
 182}
 183
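     /*
      * Map (do_map == 1) or unmap (do_map == 0) every region of the guest
      * memory table into the VF's VFIO container, using the guest physical
      * address as IOVA so the device can DMA straight into guest buffers.
      */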
 184static int
 185ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
 186{
 187        uint32_t i;
 188        int ret;
 189        struct rte_vhost_memory *mem = NULL;
 190        int vfio_container_fd;
 191
 192        ret = rte_vhost_get_mem_table(internal->vid, &mem);
 193        if (ret < 0) {
 194                DRV_LOG(ERR, "failed to get VM memory layout.");
 195                goto exit;
 196        }
 197
 198        vfio_container_fd = internal->vfio_container_fd;
 199
 200        for (i = 0; i < mem->nregions; i++) {
 201                struct rte_vhost_mem_region *reg;
 202
 203                reg = &mem->regions[i];
 204                DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
 205                        "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
 206                        do_map ? "DMA map" : "DMA unmap", i,
 207                        reg->host_user_addr, reg->guest_phys_addr, reg->size);
 208
 209                if (do_map) {
 210                        ret = rte_vfio_container_dma_map(vfio_container_fd,
 211                                reg->host_user_addr, reg->guest_phys_addr,
 212                                reg->size);
 213                        if (ret < 0) {
 214                                DRV_LOG(ERR, "DMA map failed.");
 215                                goto exit;
 216                        }
 217                } else {
 218                        ret = rte_vfio_container_dma_unmap(vfio_container_fd,
 219                                reg->host_user_addr, reg->guest_phys_addr,
 220                                reg->size);
 221                        if (ret < 0) {
 222                                DRV_LOG(ERR, "DMA unmap failed.");
 223                                goto exit;
 224                        }
 225                }
 226        }
 227
 228exit:
 229        if (mem)
 230                free(mem);
 231        return ret;
 232}
 233
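     /*
      * Translate a host virtual address into a guest physical address by
      * walking the vhost memory table. Returns 0 when no region covers the
      * address, so callers treat 0 as failure.
      */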
 234static uint64_t
 235hva_to_gpa(int vid, uint64_t hva)
 236{
 237        struct rte_vhost_memory *mem = NULL;
 238        struct rte_vhost_mem_region *reg;
 239        uint32_t i;
 240        uint64_t gpa = 0;
 241
 242        if (rte_vhost_get_mem_table(vid, &mem) < 0)
 243                goto exit;
 244
 245        for (i = 0; i < mem->nregions; i++) {
 246                reg = &mem->regions[i];
 247
 248                if (hva >= reg->host_user_addr &&
 249                                hva < reg->host_user_addr + reg->size) {
 250                        gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
 251                        break;
 252                }
 253        }
 254
 255exit:
 256        if (mem)
 257                free(mem);
 258        return gpa;
 259}
 260
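     /*
      * Program the VF for the direct (HW) datapath: translate each vring's
      * desc/avail/used addresses to GPAs, record the ring sizes and the
      * last_avail/last_used indexes reported by vhost, then start the HW.
      */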
 261static int
 262vdpa_ifcvf_start(struct ifcvf_internal *internal)
 263{
 264        struct ifcvf_hw *hw = &internal->hw;
 265        int i, nr_vring;
 266        int vid;
 267        struct rte_vhost_vring vq;
 268        uint64_t gpa;
 269
 270        vid = internal->vid;
 271        nr_vring = rte_vhost_get_vring_num(vid);
 272        rte_vhost_get_negotiated_features(vid, &hw->req_features);
 273
 274        for (i = 0; i < nr_vring; i++) {
 275                rte_vhost_get_vhost_vring(vid, i, &vq);
 276                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
 277                if (gpa == 0) {
 278                        DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
 279                        return -1;
 280                }
 281                hw->vring[i].desc = gpa;
 282
 283                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
 284                if (gpa == 0) {
 285                        DRV_LOG(ERR, "Fail to get GPA for available ring.");
 286                        return -1;
 287                }
 288                hw->vring[i].avail = gpa;
 289
 290                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
 291                if (gpa == 0) {
 292                        DRV_LOG(ERR, "Fail to get GPA for used ring.");
 293                        return -1;
 294                }
 295                hw->vring[i].used = gpa;
 296
 297                hw->vring[i].size = vq.size;
 298                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
 299                                &hw->vring[i].last_used_idx);
 300        }
 301        hw->nr_vring = i;
 302
 303        return ifcvf_start_hw(&internal->hw);
 304}
 305
 306static void
 307vdpa_ifcvf_stop(struct ifcvf_internal *internal)
 308{
 309        struct ifcvf_hw *hw = &internal->hw;
 310        uint32_t i;
 311        int vid;
 312        uint64_t features = 0;
 313        uint64_t log_base = 0, log_size = 0;
 314        uint64_t len;
 315
 316        vid = internal->vid;
 317        ifcvf_stop_hw(hw);
 318
 319        for (i = 0; i < hw->nr_vring; i++)
 320                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
 321                                hw->vring[i].last_used_idx);
 322
 323        if (internal->sw_lm)
 324                return;
 325
 326        rte_vhost_get_negotiated_features(vid, &features);
 327        if (RTE_VHOST_NEED_LOG(features)) {
 328                ifcvf_disable_logging(hw);
 329                rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
 330                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
 331                                log_base, IFCVF_LOG_BASE, log_size);
 332                /*
  333                 * IFCVF marks dirty pages only for packet buffers; the
  334                 * driver marks the used rings as dirty after the device stops.
 335                 */
 336                for (i = 0; i < hw->nr_vring; i++) {
 337                        len = IFCVF_USED_RING_LEN(hw->vring[i].size);
 338                        rte_vhost_log_used_vring(vid, i, 0, len);
 339                }
 340        }
 341}
 342
 343#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
 344                sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
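     /*
      * Bind the device MSI-X vectors to eventfds via VFIO: vector 0 is wired
      * to the device's own interrupt fd and vector N+1 to vring N's callfd.
      * When m_rx is true, RX queues (even indexes) get a driver-owned eventfd
      * instead, so the vring relay thread can intercept used-ring interrupts.
      */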
 345static int
 346vdpa_enable_vfio_intr(struct ifcvf_internal *internal, bool m_rx)
 347{
 348        int ret;
 349        uint32_t i, nr_vring;
 350        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 351        struct vfio_irq_set *irq_set;
 352        int *fd_ptr;
 353        struct rte_vhost_vring vring;
 354        int fd;
 355
 356        vring.callfd = -1;
 357
 358        nr_vring = rte_vhost_get_vring_num(internal->vid);
 359
 360        irq_set = (struct vfio_irq_set *)irq_set_buf;
 361        irq_set->argsz = sizeof(irq_set_buf);
 362        irq_set->count = nr_vring + 1;
 363        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 364                         VFIO_IRQ_SET_ACTION_TRIGGER;
 365        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 366        irq_set->start = 0;
 367        fd_ptr = (int *)&irq_set->data;
 368        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;
 369
 370        for (i = 0; i < nr_vring; i++)
 371                internal->intr_fd[i] = -1;
 372
 373        for (i = 0; i < nr_vring; i++) {
 374                rte_vhost_get_vhost_vring(internal->vid, i, &vring);
 375                fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
 376                if ((i & 1) == 0 && m_rx == true) {
 377                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
 378                        if (fd < 0) {
 379                                DRV_LOG(ERR, "can't setup eventfd: %s",
 380                                        strerror(errno));
 381                                return -1;
 382                        }
 383                        internal->intr_fd[i] = fd;
 384                        fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
 385                }
 386        }
 387
 388        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 389        if (ret) {
 390                DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
 391                                strerror(errno));
 392                return -1;
 393        }
 394
 395        return 0;
 396}
 397
 398static int
 399vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
 400{
 401        int ret;
 402        uint32_t i, nr_vring;
 403        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 404        struct vfio_irq_set *irq_set;
 405
 406        irq_set = (struct vfio_irq_set *)irq_set_buf;
 407        irq_set->argsz = sizeof(irq_set_buf);
 408        irq_set->count = 0;
 409        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
 410        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 411        irq_set->start = 0;
 412
 413        nr_vring = rte_vhost_get_vring_num(internal->vid);
 414        for (i = 0; i < nr_vring; i++) {
 415                if (internal->intr_fd[i] >= 0)
 416                        close(internal->intr_fd[i]);
 417                internal->intr_fd[i] = -1;
 418        }
 419
 420        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 421        if (ret) {
 422                DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
 423                                strerror(errno));
 424                return -1;
 425        }
 426
 427        return 0;
 428}
 429
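     /*
      * Notify relay thread: wait on the guest kick eventfds with epoll and
      * forward each kick to the VF's notify register. The epoll data packs
      * the queue id in the low 32 bits and the kickfd in the high 32 bits.
      */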
 430static void *
 431notify_relay(void *arg)
 432{
 433        int i, kickfd, epfd, nfds = 0;
 434        uint32_t qid, q_num;
 435        struct epoll_event events[IFCVF_MAX_QUEUES * 2];
 436        struct epoll_event ev;
 437        uint64_t buf;
 438        int nbytes;
 439        struct rte_vhost_vring vring;
 440        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
 441        struct ifcvf_hw *hw = &internal->hw;
 442
 443        q_num = rte_vhost_get_vring_num(internal->vid);
 444
 445        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
 446        if (epfd < 0) {
 447                DRV_LOG(ERR, "failed to create epoll instance.");
 448                return NULL;
 449        }
 450        internal->epfd = epfd;
 451
 452        vring.kickfd = -1;
 453        for (qid = 0; qid < q_num; qid++) {
 454                ev.events = EPOLLIN | EPOLLPRI;
 455                rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
 456                ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
 457                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
 458                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
 459                        return NULL;
 460                }
 461        }
 462
 463        for (;;) {
 464                nfds = epoll_wait(epfd, events, q_num, -1);
 465                if (nfds < 0) {
 466                        if (errno == EINTR)
 467                                continue;
  468                        DRV_LOG(ERR, "epoll_wait returned error");
 469                        return NULL;
 470                }
 471
 472                for (i = 0; i < nfds; i++) {
 473                        qid = events[i].data.u32;
 474                        kickfd = (uint32_t)(events[i].data.u64 >> 32);
 475                        do {
 476                                nbytes = read(kickfd, &buf, 8);
 477                                if (nbytes < 0) {
 478                                        if (errno == EINTR ||
 479                                            errno == EWOULDBLOCK ||
 480                                            errno == EAGAIN)
 481                                                continue;
 482                                        DRV_LOG(INFO, "Error reading "
 483                                                "kickfd: %s",
 484                                                strerror(errno));
 485                                }
 486                                break;
 487                        } while (1);
 488
 489                        ifcvf_notify_queue(hw, qid);
 490                }
 491        }
 492
 493        return NULL;
 494}
 495
 496static int
 497setup_notify_relay(struct ifcvf_internal *internal)
 498{
 499        char name[THREAD_NAME_LEN];
 500        int ret;
 501
 502        snprintf(name, sizeof(name), "ifc-notify-%d", internal->vid);
 503        ret = rte_ctrl_thread_create(&internal->tid, name, NULL, notify_relay,
 504                                     (void *)internal);
 505        if (ret != 0) {
 506                DRV_LOG(ERR, "failed to create notify relay pthread.");
 507                return -1;
 508        }
 509
 510        return 0;
 511}
 512
 513static int
 514unset_notify_relay(struct ifcvf_internal *internal)
 515{
 516        void *status;
 517
 518        if (internal->tid) {
 519                pthread_cancel(internal->tid);
 520                pthread_join(internal->tid, &status);
 521        }
 522        internal->tid = 0;
 523
 524        if (internal->epfd >= 0)
 525                close(internal->epfd);
 526        internal->epfd = -1;
 527
 528        return 0;
 529}
 530
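     /*
      * Reconcile the datapath with the started/dev_attached flags under the
      * lock: bring up DMA mapping, interrupts, the HW rings and the notify
      * relay once the device is both started and attached, and tear them
      * down again once either flag is cleared.
      */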
 531static int
 532update_datapath(struct ifcvf_internal *internal)
 533{
 534        int ret;
 535
 536        rte_spinlock_lock(&internal->lock);
 537
 538        if (!rte_atomic32_read(&internal->running) &&
 539            (rte_atomic32_read(&internal->started) &&
 540             rte_atomic32_read(&internal->dev_attached))) {
 541                ret = ifcvf_dma_map(internal, 1);
 542                if (ret)
 543                        goto err;
 544
 545                ret = vdpa_enable_vfio_intr(internal, 0);
 546                if (ret)
 547                        goto err;
 548
 549                ret = vdpa_ifcvf_start(internal);
 550                if (ret)
 551                        goto err;
 552
 553                ret = setup_notify_relay(internal);
 554                if (ret)
 555                        goto err;
 556
 557                rte_atomic32_set(&internal->running, 1);
 558        } else if (rte_atomic32_read(&internal->running) &&
 559                   (!rte_atomic32_read(&internal->started) ||
 560                    !rte_atomic32_read(&internal->dev_attached))) {
 561                ret = unset_notify_relay(internal);
 562                if (ret)
 563                        goto err;
 564
 565                vdpa_ifcvf_stop(internal);
 566
 567                ret = vdpa_disable_vfio_intr(internal);
 568                if (ret)
 569                        goto err;
 570
 571                ret = ifcvf_dma_map(internal, 0);
 572                if (ret)
 573                        goto err;
 574
 575                rte_atomic32_set(&internal->running, 0);
 576        }
 577
 578        rte_spinlock_unlock(&internal->lock);
 579        return 0;
 580err:
 581        rte_spinlock_unlock(&internal->lock);
 582        return ret;
 583}
 584
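     /*
      * Start the VF with mediated vrings for the SW live-migration fallback.
      * TX queues keep using the guest used ring directly, while each RX
      * queue's used ring is redirected to a driver-allocated shadow ring
      * mapped at the IFCVF_MEDIATED_VRING IOVA, so the relay thread can copy
      * used entries back to the guest-visible ring.
      */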
 585static int
 586m_ifcvf_start(struct ifcvf_internal *internal)
 587{
 588        struct ifcvf_hw *hw = &internal->hw;
 589        uint32_t i, nr_vring;
 590        int vid, ret;
 591        struct rte_vhost_vring vq;
 592        void *vring_buf;
 593        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
 594        uint64_t size;
 595        uint64_t gpa;
 596
 597        memset(&vq, 0, sizeof(vq));
 598        vid = internal->vid;
 599        nr_vring = rte_vhost_get_vring_num(vid);
 600        rte_vhost_get_negotiated_features(vid, &hw->req_features);
 601
 602        for (i = 0; i < nr_vring; i++) {
 603                rte_vhost_get_vhost_vring(vid, i, &vq);
 604
 605                size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
 606                                rte_mem_page_size());
 607                vring_buf = rte_zmalloc("ifcvf", size, rte_mem_page_size());
 608                vring_init(&internal->m_vring[i], vq.size, vring_buf,
 609                                rte_mem_page_size());
 610
 611                ret = rte_vfio_container_dma_map(internal->vfio_container_fd,
 612                        (uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
 613                if (ret < 0) {
 614                        DRV_LOG(ERR, "mediated vring DMA map failed.");
 615                        goto error;
 616                }
 617
 618                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
 619                if (gpa == 0) {
 620                        DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
 621                        return -1;
 622                }
 623                hw->vring[i].desc = gpa;
 624
 625                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
 626                if (gpa == 0) {
 627                        DRV_LOG(ERR, "Fail to get GPA for available ring.");
 628                        return -1;
 629                }
 630                hw->vring[i].avail = gpa;
 631
 632                /* Direct I/O for Tx queue, relay for Rx queue */
 633                if (i & 1) {
 634                        gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
 635                        if (gpa == 0) {
 636                                DRV_LOG(ERR, "Fail to get GPA for used ring.");
 637                                return -1;
 638                        }
 639                        hw->vring[i].used = gpa;
 640                } else {
 641                        hw->vring[i].used = m_vring_iova +
 642                                (char *)internal->m_vring[i].used -
 643                                (char *)internal->m_vring[i].desc;
 644                }
 645
 646                hw->vring[i].size = vq.size;
 647
 648                rte_vhost_get_vring_base(vid, i,
 649                                &internal->m_vring[i].avail->idx,
 650                                &internal->m_vring[i].used->idx);
 651
 652                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
 653                                &hw->vring[i].last_used_idx);
 654
 655                m_vring_iova += size;
 656        }
 657        hw->nr_vring = nr_vring;
 658
 659        return ifcvf_start_hw(&internal->hw);
 660
 661error:
 662        for (i = 0; i < nr_vring; i++)
 663                if (internal->m_vring[i].desc)
 664                        rte_free(internal->m_vring[i].desc);
 665
 666        return -1;
 667}
 668
 669static int
 670m_ifcvf_stop(struct ifcvf_internal *internal)
 671{
 672        int vid;
 673        uint32_t i;
 674        struct rte_vhost_vring vq;
 675        struct ifcvf_hw *hw = &internal->hw;
 676        uint64_t m_vring_iova = IFCVF_MEDIATED_VRING;
 677        uint64_t size, len;
 678
 679        vid = internal->vid;
 680        ifcvf_stop_hw(hw);
 681
 682        for (i = 0; i < hw->nr_vring; i++) {
 683                /* synchronize remaining new used entries if any */
 684                if ((i & 1) == 0)
 685                        update_used_ring(internal, i);
 686
 687                rte_vhost_get_vhost_vring(vid, i, &vq);
 688                len = IFCVF_USED_RING_LEN(vq.size);
 689                rte_vhost_log_used_vring(vid, i, 0, len);
 690
 691                size = RTE_ALIGN_CEIL(vring_size(vq.size, rte_mem_page_size()),
 692                                rte_mem_page_size());
 693                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
 694                        (uint64_t)(uintptr_t)internal->m_vring[i].desc,
 695                        m_vring_iova, size);
 696
 697                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
 698                                hw->vring[i].last_used_idx);
 699                rte_free(internal->m_vring[i].desc);
 700                m_vring_iova += size;
 701        }
 702
 703        return 0;
 704}
 705
 706static void
 707update_used_ring(struct ifcvf_internal *internal, uint16_t qid)
 708{
 709        rte_vdpa_relay_vring_used(internal->vid, qid, &internal->m_vring[qid]);
 710        rte_vhost_vring_call(internal->vid, qid);
 711}
 712
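     /*
      * Vring relay thread for the SW fallback: epoll on both the guest kick
      * fds and the per-RX-queue interrupt eventfds. Bit 0 of the epoll data
      * tells the two apart (1 = device interrupt, relay the used ring;
      * 0 = guest kick, ring the VF doorbell); the queue id sits in the next
      * bits and the fd in the high 32 bits.
      */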
 713static void *
 714vring_relay(void *arg)
 715{
 716        int i, vid, epfd, fd, nfds;
 717        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
 718        struct rte_vhost_vring vring;
 719        uint16_t qid, q_num;
 720        struct epoll_event events[IFCVF_MAX_QUEUES * 4];
 721        struct epoll_event ev;
 722        int nbytes;
 723        uint64_t buf;
 724
 725        vid = internal->vid;
 726        q_num = rte_vhost_get_vring_num(vid);
 727
 728        /* add notify fd and interrupt fd to epoll */
 729        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
 730        if (epfd < 0) {
 731                DRV_LOG(ERR, "failed to create epoll instance.");
 732                return NULL;
 733        }
 734        internal->epfd = epfd;
 735
 736        vring.kickfd = -1;
 737        for (qid = 0; qid < q_num; qid++) {
 738                ev.events = EPOLLIN | EPOLLPRI;
 739                rte_vhost_get_vhost_vring(vid, qid, &vring);
 740                ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
 741                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
 742                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
 743                        return NULL;
 744                }
 745        }
 746
 747        for (qid = 0; qid < q_num; qid += 2) {
 748                ev.events = EPOLLIN | EPOLLPRI;
  749                /* set bit 0 as a flag to mark this fd as an interrupt fd */
 750                ev.data.u64 = 1 | qid << 1 |
 751                        (uint64_t)internal->intr_fd[qid] << 32;
 752                if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev)
 753                                < 0) {
 754                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
 755                        return NULL;
 756                }
 757                update_used_ring(internal, qid);
 758        }
 759
 760        /* start relay with a first kick */
 761        for (qid = 0; qid < q_num; qid++)
 762                ifcvf_notify_queue(&internal->hw, qid);
 763
 764        /* listen to the events and react accordingly */
 765        for (;;) {
 766                nfds = epoll_wait(epfd, events, q_num * 2, -1);
 767                if (nfds < 0) {
 768                        if (errno == EINTR)
 769                                continue;
  770                        DRV_LOG(ERR, "epoll_wait returned error");
 771                        return NULL;
 772                }
 773
 774                for (i = 0; i < nfds; i++) {
 775                        fd = (uint32_t)(events[i].data.u64 >> 32);
 776                        do {
 777                                nbytes = read(fd, &buf, 8);
 778                                if (nbytes < 0) {
 779                                        if (errno == EINTR ||
 780                                            errno == EWOULDBLOCK ||
 781                                            errno == EAGAIN)
 782                                                continue;
 783                                        DRV_LOG(INFO, "Error reading "
 784                                                "kickfd: %s",
 785                                                strerror(errno));
 786                                }
 787                                break;
 788                        } while (1);
 789
 790                        qid = events[i].data.u32 >> 1;
 791
 792                        if (events[i].data.u32 & 1)
 793                                update_used_ring(internal, qid);
 794                        else
 795                                ifcvf_notify_queue(&internal->hw, qid);
 796                }
 797        }
 798
 799        return NULL;
 800}
 801
 802static int
 803setup_vring_relay(struct ifcvf_internal *internal)
 804{
 805        char name[THREAD_NAME_LEN];
 806        int ret;
 807
 808        snprintf(name, sizeof(name), "ifc-vring-%d", internal->vid);
 809        ret = rte_ctrl_thread_create(&internal->tid, name, NULL, vring_relay,
 810                                     (void *)internal);
 811        if (ret != 0) {
 812                DRV_LOG(ERR, "failed to create ring relay pthread.");
 813                return -1;
 814        }
 815
 816        return 0;
 817}
 818
 819static int
 820unset_vring_relay(struct ifcvf_internal *internal)
 821{
 822        void *status;
 823
 824        if (internal->tid) {
 825                pthread_cancel(internal->tid);
 826                pthread_join(internal->tid, &status);
 827        }
 828        internal->tid = 0;
 829
 830        if (internal->epfd >= 0)
 831                close(internal->epfd);
 832        internal->epfd = -1;
 833
 834        return 0;
 835}
 836
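     /*
      * Switch a running device from the direct HW datapath to the SW relay
      * datapath. Called when live migration starts (VHOST_F_LOG_ALL is
      * negotiated) and the VF was probed with sw-live-migration enabled.
      */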
 837static int
 838ifcvf_sw_fallback_switchover(struct ifcvf_internal *internal)
 839{
 840        int ret;
 841        int vid = internal->vid;
 842
 843        /* stop the direct IO data path */
 844        unset_notify_relay(internal);
 845        vdpa_ifcvf_stop(internal);
 846        vdpa_disable_vfio_intr(internal);
 847
 848        ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
 849        if (ret && ret != -ENOTSUP)
 850                goto error;
 851
 852        /* set up interrupt for interrupt relay */
 853        ret = vdpa_enable_vfio_intr(internal, 1);
 854        if (ret)
 855                goto unmap;
 856
 857        /* config the VF */
 858        ret = m_ifcvf_start(internal);
 859        if (ret)
 860                goto unset_intr;
 861
 862        /* set up vring relay thread */
 863        ret = setup_vring_relay(internal);
 864        if (ret)
 865                goto stop_vf;
 866
 867        rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
 868
 869        internal->sw_fallback_running = true;
 870
 871        return 0;
 872
 873stop_vf:
 874        m_ifcvf_stop(internal);
 875unset_intr:
 876        vdpa_disable_vfio_intr(internal);
 877unmap:
 878        ifcvf_dma_map(internal, 0);
 879error:
 880        return -1;
 881}
 882
 883static int
 884ifcvf_dev_config(int vid)
 885{
 886        struct rte_vdpa_device *vdev;
 887        struct internal_list *list;
 888        struct ifcvf_internal *internal;
 889
 890        vdev = rte_vhost_get_vdpa_device(vid);
 891        list = find_internal_resource_by_vdev(vdev);
 892        if (list == NULL) {
 893                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
 894                return -1;
 895        }
 896
 897        internal = list->internal;
 898        internal->vid = vid;
 899        rte_atomic32_set(&internal->dev_attached, 1);
 900        update_datapath(internal);
 901
 902        if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
 903                DRV_LOG(NOTICE, "vDPA (%s): software relay is used.",
 904                                vdev->device->name);
 905
 906        internal->configured = 1;
 907        return 0;
 908}
 909
 910static int
 911ifcvf_dev_close(int vid)
 912{
 913        struct rte_vdpa_device *vdev;
 914        struct internal_list *list;
 915        struct ifcvf_internal *internal;
 916
 917        vdev = rte_vhost_get_vdpa_device(vid);
 918        list = find_internal_resource_by_vdev(vdev);
 919        if (list == NULL) {
 920                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
 921                return -1;
 922        }
 923
 924        internal = list->internal;
 925
 926        if (internal->sw_fallback_running) {
 927                /* unset ring relay */
 928                unset_vring_relay(internal);
 929
 930                /* reset VF */
 931                m_ifcvf_stop(internal);
 932
 933                /* remove interrupt setting */
 934                vdpa_disable_vfio_intr(internal);
 935
 936                /* unset DMA map for guest memory */
 937                ifcvf_dma_map(internal, 0);
 938
 939                internal->sw_fallback_running = false;
 940        } else {
 941                rte_atomic32_set(&internal->dev_attached, 0);
 942                update_datapath(internal);
 943        }
 944
 945        internal->configured = 0;
 946        return 0;
 947}
 948
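     /*
      * React to a change of negotiated features. When dirty-page logging is
      * requested (live migration), either switch over to the SW relay
      * fallback or map the vhost log buffer at IFCVF_LOG_BASE and enable HW
      * logging on the VF.
      */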
 949static int
 950ifcvf_set_features(int vid)
 951{
 952        uint64_t features = 0;
 953        struct rte_vdpa_device *vdev;
 954        struct internal_list *list;
 955        struct ifcvf_internal *internal;
 956        uint64_t log_base = 0, log_size = 0;
 957
 958        vdev = rte_vhost_get_vdpa_device(vid);
 959        list = find_internal_resource_by_vdev(vdev);
 960        if (list == NULL) {
 961                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
 962                return -1;
 963        }
 964
 965        internal = list->internal;
 966        rte_vhost_get_negotiated_features(vid, &features);
 967
 968        if (!RTE_VHOST_NEED_LOG(features))
 969                return 0;
 970
 971        if (internal->sw_lm) {
 972                ifcvf_sw_fallback_switchover(internal);
 973        } else {
 974                rte_vhost_get_log_base(vid, &log_base, &log_size);
 975                rte_vfio_container_dma_map(internal->vfio_container_fd,
 976                                log_base, IFCVF_LOG_BASE, log_size);
 977                ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
 978        }
 979
 980        return 0;
 981}
 982
 983static int
 984ifcvf_get_vfio_group_fd(int vid)
 985{
 986        struct rte_vdpa_device *vdev;
 987        struct internal_list *list;
 988
 989        vdev = rte_vhost_get_vdpa_device(vid);
 990        list = find_internal_resource_by_vdev(vdev);
 991        if (list == NULL) {
 992                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
 993                return -1;
 994        }
 995
 996        return list->internal->vfio_group_fd;
 997}
 998
 999static int
1000ifcvf_get_vfio_device_fd(int vid)
1001{
1002        struct rte_vdpa_device *vdev;
1003        struct internal_list *list;
1004
1005        vdev = rte_vhost_get_vdpa_device(vid);
1006        list = find_internal_resource_by_vdev(vdev);
1007        if (list == NULL) {
1008                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1009                return -1;
1010        }
1011
1012        return list->internal->vfio_dev_fd;
1013}
1014
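     /*
      * Report the offset and size of a queue's notify (doorbell) area inside
      * the VFIO device region, so the vhost library can let the guest kick
      * the VF directly through a host notifier mmap.
      */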
1015static int
1016ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
1017{
1018        struct rte_vdpa_device *vdev;
1019        struct internal_list *list;
1020        struct ifcvf_internal *internal;
1021        struct vfio_region_info reg = { .argsz = sizeof(reg) };
1022        int ret;
1023
1024        vdev = rte_vhost_get_vdpa_device(vid);
1025        list = find_internal_resource_by_vdev(vdev);
1026        if (list == NULL) {
1027                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1028                return -1;
1029        }
1030
1031        internal = list->internal;
1032
1033        reg.index = ifcvf_get_notify_region(&internal->hw);
1034        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
1035        if (ret) {
 1036                DRV_LOG(ERR, "Cannot get device region info: %s",
1037                                strerror(errno));
1038                return -1;
1039        }
1040
1041        *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
1042        *size = 0x1000;
1043
1044        return 0;
1045}
1046
1047static int
1048ifcvf_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
1049{
1050        struct internal_list *list;
1051
1052        list = find_internal_resource_by_vdev(vdev);
1053        if (list == NULL) {
1054                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1055                return -1;
1056        }
1057
1058        *queue_num = list->internal->max_queues;
1059
1060        return 0;
1061}
1062
1063static int
1064ifcvf_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
1065{
1066        struct internal_list *list;
1067
1068        list = find_internal_resource_by_vdev(vdev);
1069        if (list == NULL) {
1070                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1071                return -1;
1072        }
1073
1074        *features = list->internal->features;
1075
1076        return 0;
1077}
1078
1079#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
1080                (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
1081                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
1082                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
1083                 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
1084                 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
1085                 1ULL << VHOST_USER_PROTOCOL_F_STATUS)
1086static int
1087ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
1088{
1089        RTE_SET_USED(vdev);
1090
1091        *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
1092        return 0;
1093}
1094
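     /*
      * Enable or disable one vring by selecting it in the virtio common
      * config space and toggling queue_enable, tearing down or re-creating
      * the VFIO MSI-X bindings when the state flips. If the device is not
      * configured yet, only the software state is recorded.
      */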
1095static int
1096ifcvf_set_vring_state(int vid, int vring, int state)
1097{
1098        struct rte_vdpa_device *vdev;
1099        struct internal_list *list;
1100        struct ifcvf_internal *internal;
1101        struct ifcvf_hw *hw;
1102        struct ifcvf_pci_common_cfg *cfg;
1103        int ret = 0;
1104
1105        vdev = rte_vhost_get_vdpa_device(vid);
1106        list = find_internal_resource_by_vdev(vdev);
1107        if (list == NULL) {
1108                DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
1109                return -1;
1110        }
1111
1112        internal = list->internal;
1113        if (vring < 0 || vring >= internal->max_queues * 2) {
1114                DRV_LOG(ERR, "Vring index %d not correct", vring);
1115                return -1;
1116        }
1117
1118        hw = &internal->hw;
1119        if (!internal->configured)
1120                goto exit;
1121
1122        cfg = hw->common_cfg;
1123        IFCVF_WRITE_REG16(vring, &cfg->queue_select);
1124        IFCVF_WRITE_REG16(!!state, &cfg->queue_enable);
1125
1126        if (!state && hw->vring[vring].enable) {
1127                ret = vdpa_disable_vfio_intr(internal);
1128                if (ret)
1129                        return ret;
1130        }
1131
1132        if (state && !hw->vring[vring].enable) {
1133                ret = vdpa_enable_vfio_intr(internal, 0);
1134                if (ret)
1135                        return ret;
1136        }
1137
1138exit:
1139        hw->vring[vring].enable = !!state;
1140        return 0;
1141}
1142
1143static struct rte_vdpa_dev_ops ifcvf_ops = {
1144        .get_queue_num = ifcvf_get_queue_num,
1145        .get_features = ifcvf_get_vdpa_features,
1146        .get_protocol_features = ifcvf_get_protocol_features,
1147        .dev_conf = ifcvf_dev_config,
1148        .dev_close = ifcvf_dev_close,
1149        .set_vring_state = ifcvf_set_vring_state,
1150        .set_features = ifcvf_set_features,
1151        .migration_done = NULL,
1152        .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
1153        .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
1154        .get_notify_area = ifcvf_get_notify_area,
1155};
1156
1157static inline int
1158open_int(const char *key __rte_unused, const char *value, void *extra_args)
1159{
1160        uint16_t *n = extra_args;
1161
1162        if (value == NULL || extra_args == NULL)
1163                return -EINVAL;
1164
1165        *n = (uint16_t)strtoul(value, NULL, 0);
1166        if (*n == USHRT_MAX && errno == ERANGE)
1167                return -1;
1168
1169        return 0;
1170}
1171
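     /*
      * PCI probe: only take ownership of the VF when the "vdpa" devarg is
      * set (e.g. -a 0000:06:00.1,vdpa=1[,sw-live-migration=1]); otherwise
      * return 1 to leave the device to other drivers.
      */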
1172static int
1173ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1174                struct rte_pci_device *pci_dev)
1175{
1176        uint64_t features;
1177        struct ifcvf_internal *internal = NULL;
1178        struct internal_list *list = NULL;
1179        int vdpa_mode = 0;
1180        int sw_fallback_lm = 0;
1181        struct rte_kvargs *kvlist = NULL;
1182        int ret = 0;
1183
1184        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1185                return 0;
1186
1187        if (!pci_dev->device.devargs)
1188                return 1;
1189
1190        kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
1191                        ifcvf_valid_arguments);
1192        if (kvlist == NULL)
1193                return 1;
1194
1195        /* probe only when vdpa mode is specified */
1196        if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
1197                rte_kvargs_free(kvlist);
1198                return 1;
1199        }
1200
1201        ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
1202                        &vdpa_mode);
1203        if (ret < 0 || vdpa_mode == 0) {
1204                rte_kvargs_free(kvlist);
1205                return 1;
1206        }
1207
1208        list = rte_zmalloc("ifcvf", sizeof(*list), 0);
1209        if (list == NULL)
1210                goto error;
1211
1212        internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
1213        if (internal == NULL)
1214                goto error;
1215
1216        internal->pdev = pci_dev;
1217        rte_spinlock_init(&internal->lock);
1218
1219        if (ifcvf_vfio_setup(internal) < 0) {
1220                DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
1221                goto error;
1222        }
1223
1224        if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
1225                DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
1226                goto error;
1227        }
1228
1229        internal->configured = 0;
1230        internal->max_queues = IFCVF_MAX_QUEUES;
1231        features = ifcvf_get_features(&internal->hw);
1232        internal->features = (features &
1233                ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
1234                (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
1235                (1ULL << VIRTIO_NET_F_CTRL_VQ) |
1236                (1ULL << VIRTIO_NET_F_STATUS) |
1237                (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
1238                (1ULL << VHOST_F_LOG_ALL);
1239
1240        list->internal = internal;
1241
1242        if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
1243                ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
1244                                &open_int, &sw_fallback_lm);
1245                if (ret < 0)
1246                        goto error;
1247        }
1248        internal->sw_lm = sw_fallback_lm;
1249
1250        internal->vdev = rte_vdpa_register_device(&pci_dev->device, &ifcvf_ops);
1251        if (internal->vdev == NULL) {
1252                DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
1253                goto error;
1254        }
1255
1256        pthread_mutex_lock(&internal_list_lock);
1257        TAILQ_INSERT_TAIL(&internal_list, list, next);
1258        pthread_mutex_unlock(&internal_list_lock);
1259
1260        rte_atomic32_set(&internal->started, 1);
1261        update_datapath(internal);
1262
1263        rte_kvargs_free(kvlist);
1264        return 0;
1265
1266error:
1267        rte_kvargs_free(kvlist);
1268        rte_free(list);
1269        rte_free(internal);
1270        return -1;
1271}
1272
1273static int
1274ifcvf_pci_remove(struct rte_pci_device *pci_dev)
1275{
1276        struct ifcvf_internal *internal;
1277        struct internal_list *list;
1278
1279        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1280                return 0;
1281
1282        list = find_internal_resource_by_dev(pci_dev);
1283        if (list == NULL) {
1284                DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
1285                return -1;
1286        }
1287
1288        internal = list->internal;
1289        rte_atomic32_set(&internal->started, 0);
1290        update_datapath(internal);
1291
1292        rte_pci_unmap_device(internal->pdev);
1293        rte_vfio_container_destroy(internal->vfio_container_fd);
1294        rte_vdpa_unregister_device(internal->vdev);
1295
1296        pthread_mutex_lock(&internal_list_lock);
1297        TAILQ_REMOVE(&internal_list, list, next);
1298        pthread_mutex_unlock(&internal_list_lock);
1299
1300        rte_free(list);
1301        rte_free(internal);
1302
1303        return 0;
1304}
1305
1306/*
 1307 * IFCVF has the same vendor ID and device ID as a virtio-net PCI
 1308 * device, but uses its own subsystem vendor ID and device ID.
1309 */
1310static const struct rte_pci_id pci_id_ifcvf_map[] = {
1311        { .class_id = RTE_CLASS_ANY_ID,
1312          .vendor_id = IFCVF_VENDOR_ID,
1313          .device_id = IFCVF_DEVICE_ID,
1314          .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
1315          .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
1316        },
1317
1318        { .vendor_id = 0, /* sentinel */
1319        },
1320};
1321
1322static struct rte_pci_driver rte_ifcvf_vdpa = {
1323        .id_table = pci_id_ifcvf_map,
1324        .drv_flags = 0,
1325        .probe = ifcvf_pci_probe,
1326        .remove = ifcvf_pci_remove,
1327};
1328
1329RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
1330RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
1331RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");
1332