dpdk/drivers/bus/pci/linux/pci_vfio.c
<<
>>
Prefs
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2010-2014 Intel Corporation
   3 */
   4
   5#include <string.h>
   6#include <fcntl.h>
   7#include <linux/pci_regs.h>
   8#include <sys/eventfd.h>
   9#include <sys/socket.h>
  10#include <sys/ioctl.h>
  11#include <sys/mman.h>
  12#include <stdbool.h>
  13
  14#include <rte_log.h>
  15#include <rte_pci.h>
  16#include <rte_bus_pci.h>
  17#include <rte_eal_paging.h>
  18#include <rte_malloc.h>
  19#include <rte_vfio.h>
  20#include <rte_eal.h>
  21#include <rte_bus.h>
  22#include <rte_spinlock.h>
  23#include <rte_tailq.h>
  24
  25#include "eal_filesystem.h"
  26
  27#include "pci_init.h"
  28#include "private.h"
  29
  30/**
  31 * @file
  32 * PCI probing under linux (VFIO version)
  33 *
  34 * This code tries to determine if the PCI device is bound to VFIO driver,
  35 * and initialize it (map BARs, set up interrupts) if that's the case.
  36 *
  37 */
  38
  39#ifdef VFIO_PRESENT
  40
  41static struct rte_tailq_elem rte_vfio_tailq = {
  42        .name = "VFIO_RESOURCE_LIST",
  43};
  44EAL_REGISTER_TAILQ(rte_vfio_tailq)
  45
  46int
  47pci_vfio_read_config(const struct rte_intr_handle *intr_handle,
  48                    void *buf, size_t len, off_t offs)
  49{
  50        return pread64(intr_handle->vfio_dev_fd, buf, len,
  51               VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
  52}
  53
  54int
  55pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
  56                    const void *buf, size_t len, off_t offs)
  57{
  58        return pwrite64(intr_handle->vfio_dev_fd, buf, len,
  59               VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + offs);
  60}
  61
  62/* get PCI BAR number where MSI-X interrupts are */
  63static int
  64pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
  65{
  66        int ret;
  67        uint32_t reg;
  68        uint16_t flags;
  69        uint8_t cap_id, cap_offset;
  70
  71        /* read PCI capability pointer from config space */
  72        ret = pread64(fd, &reg, sizeof(reg),
  73                        VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
  74                        PCI_CAPABILITY_LIST);
  75        if (ret != sizeof(reg)) {
  76                RTE_LOG(ERR, EAL,
  77                        "Cannot read capability pointer from PCI config space!\n");
  78                return -1;
  79        }
  80
  81        /* we need first byte */
  82        cap_offset = reg & 0xFF;
  83
  84        while (cap_offset) {
  85
  86                /* read PCI capability ID */
  87                ret = pread64(fd, &reg, sizeof(reg),
  88                                VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
  89                                cap_offset);
  90                if (ret != sizeof(reg)) {
  91                        RTE_LOG(ERR, EAL,
  92                                "Cannot read capability ID from PCI config space!\n");
  93                        return -1;
  94                }
  95
  96                /* we need first byte */
  97                cap_id = reg & 0xFF;
  98
  99                /* if we haven't reached MSI-X, check next capability */
 100                if (cap_id != PCI_CAP_ID_MSIX) {
 101                        ret = pread64(fd, &reg, sizeof(reg),
 102                                        VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
 103                                        cap_offset);
 104                        if (ret != sizeof(reg)) {
 105                                RTE_LOG(ERR, EAL,
 106                                        "Cannot read capability pointer from PCI config space!\n");
 107                                return -1;
 108                        }
 109
 110                        /* we need second byte */
 111                        cap_offset = (reg & 0xFF00) >> 8;
 112
 113                        continue;
 114                }
 115                /* else, read table offset */
 116                else {
 117                        /* table offset resides in the next 4 bytes */
 118                        ret = pread64(fd, &reg, sizeof(reg),
 119                                        VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
 120                                        cap_offset + 4);
 121                        if (ret != sizeof(reg)) {
 122                                RTE_LOG(ERR, EAL,
 123                                        "Cannot read table offset from PCI config space!\n");
 124                                return -1;
 125                        }
 126
 127                        ret = pread64(fd, &flags, sizeof(flags),
 128                                        VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
 129                                        cap_offset + 2);
 130                        if (ret != sizeof(flags)) {
 131                                RTE_LOG(ERR, EAL,
 132                                        "Cannot read table flags from PCI config space!\n");
 133                                return -1;
 134                        }
 135
 136                        msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
 137                        msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
 138                        msix_table->size =
 139                                16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
 140
 141                        return 0;
 142                }
 143        }
 144        return 0;
 145}
 146
 147/* enable PCI bus memory space */
 148static int
 149pci_vfio_enable_bus_memory(int dev_fd)
 150{
 151        uint16_t cmd;
 152        int ret;
 153
 154        ret = pread64(dev_fd, &cmd, sizeof(cmd),
 155                      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
 156                      PCI_COMMAND);
 157
 158        if (ret != sizeof(cmd)) {
 159                RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
 160                return -1;
 161        }
 162
 163        if (cmd & PCI_COMMAND_MEMORY)
 164                return 0;
 165
 166        cmd |= PCI_COMMAND_MEMORY;
 167        ret = pwrite64(dev_fd, &cmd, sizeof(cmd),
 168                       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
 169                       PCI_COMMAND);
 170
 171        if (ret != sizeof(cmd)) {
 172                RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
 173                return -1;
 174        }
 175
 176        return 0;
 177}
 178
 179/* set PCI bus mastering */
 180static int
 181pci_vfio_set_bus_master(int dev_fd, bool op)
 182{
 183        uint16_t reg;
 184        int ret;
 185
 186        ret = pread64(dev_fd, &reg, sizeof(reg),
 187                        VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
 188                        PCI_COMMAND);
 189        if (ret != sizeof(reg)) {
 190                RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
 191                return -1;
 192        }
 193
 194        if (op)
 195                /* set the master bit */
 196                reg |= PCI_COMMAND_MASTER;
 197        else
 198                reg &= ~(PCI_COMMAND_MASTER);
 199
 200        ret = pwrite64(dev_fd, &reg, sizeof(reg),
 201                        VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
 202                        PCI_COMMAND);
 203
 204        if (ret != sizeof(reg)) {
 205                RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
 206                return -1;
 207        }
 208
 209        return 0;
 210}
 211
 212/* set up interrupt support (but not enable interrupts) */
 213static int
 214pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
 215{
 216        int i, ret, intr_idx;
 217        enum rte_intr_mode intr_mode;
 218
 219        /* default to invalid index */
 220        intr_idx = VFIO_PCI_NUM_IRQS;
 221
 222        /* Get default / configured intr_mode */
 223        intr_mode = rte_eal_vfio_intr_mode();
 224
 225        /* get interrupt type from internal config (MSI-X by default, can be
 226         * overridden from the command line
 227         */
 228        switch (intr_mode) {
 229        case RTE_INTR_MODE_MSIX:
 230                intr_idx = VFIO_PCI_MSIX_IRQ_INDEX;
 231                break;
 232        case RTE_INTR_MODE_MSI:
 233                intr_idx = VFIO_PCI_MSI_IRQ_INDEX;
 234                break;
 235        case RTE_INTR_MODE_LEGACY:
 236                intr_idx = VFIO_PCI_INTX_IRQ_INDEX;
 237                break;
 238        /* don't do anything if we want to automatically determine interrupt type */
 239        case RTE_INTR_MODE_NONE:
 240                break;
 241        default:
 242                RTE_LOG(ERR, EAL, "Unknown default interrupt type!\n");
 243                return -1;
 244        }
 245
 246        /* start from MSI-X interrupt type */
 247        for (i = VFIO_PCI_MSIX_IRQ_INDEX; i >= 0; i--) {
 248                struct vfio_irq_info irq = { .argsz = sizeof(irq) };
 249                int fd = -1;
 250
 251                /* skip interrupt modes we don't want */
 252                if (intr_mode != RTE_INTR_MODE_NONE &&
 253                                i != intr_idx)
 254                        continue;
 255
 256                irq.index = i;
 257
 258                ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
 259                if (ret < 0) {
 260                        RTE_LOG(ERR, EAL, "Cannot get VFIO IRQ info, error "
 261                                        "%i (%s)\n", errno, strerror(errno));
 262                        return -1;
 263                }
 264
 265                /* if this vector cannot be used with eventfd, fail if we explicitly
 266                 * specified interrupt type, otherwise continue */
 267                if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) {
 268                        if (intr_mode != RTE_INTR_MODE_NONE) {
 269                                RTE_LOG(ERR, EAL,
 270                                        "Interrupt vector does not support eventfd!\n");
 271                                return -1;
 272                        } else
 273                                continue;
 274                }
 275
 276                /* set up an eventfd for interrupts */
 277                fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
 278                if (fd < 0) {
 279                        RTE_LOG(ERR, EAL, "Cannot set up eventfd, error "
 280                                        "%i (%s)\n", errno, strerror(errno));
 281                        return -1;
 282                }
 283
 284                dev->intr_handle.fd = fd;
 285                dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
 286
 287                switch (i) {
 288                case VFIO_PCI_MSIX_IRQ_INDEX:
 289                        intr_mode = RTE_INTR_MODE_MSIX;
 290                        dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSIX;
 291                        break;
 292                case VFIO_PCI_MSI_IRQ_INDEX:
 293                        intr_mode = RTE_INTR_MODE_MSI;
 294                        dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_MSI;
 295                        break;
 296                case VFIO_PCI_INTX_IRQ_INDEX:
 297                        intr_mode = RTE_INTR_MODE_LEGACY;
 298                        dev->intr_handle.type = RTE_INTR_HANDLE_VFIO_LEGACY;
 299                        break;
 300                default:
 301                        RTE_LOG(ERR, EAL, "Unknown interrupt type!\n");
 302                        return -1;
 303                }
 304
 305                return 0;
 306        }
 307
 308        /* if we're here, we haven't found a suitable interrupt vector */
 309        return -1;
 310}
 311
 312#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 313/*
 314 * Spinlock for device hot-unplug failure handling.
 315 * If it tries to access bus or device, such as handle sigbus on bus
 316 * or handle memory failure for device, just need to use this lock.
 317 * It could protect the bus and the device to avoid race condition.
 318 */
 319static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
 320
 321static void
 322pci_vfio_req_handler(void *param)
 323{
 324        struct rte_bus *bus;
 325        int ret;
 326        struct rte_device *device = (struct rte_device *)param;
 327
 328        rte_spinlock_lock(&failure_handle_lock);
 329        bus = rte_bus_find_by_device(device);
 330        if (bus == NULL) {
 331                RTE_LOG(ERR, EAL, "Cannot find bus for device (%s)\n",
 332                        device->name);
 333                goto handle_end;
 334        }
 335
 336        /*
 337         * vfio kernel module request user space to release allocated
 338         * resources before device be deleted in kernel, so it can directly
 339         * call the vfio bus hot-unplug handler to process it.
 340         */
 341        ret = bus->hot_unplug_handler(device);
 342        if (ret)
 343                RTE_LOG(ERR, EAL,
 344                        "Can not handle hot-unplug for device (%s)\n",
 345                        device->name);
 346handle_end:
 347        rte_spinlock_unlock(&failure_handle_lock);
 348}
 349
 350/* enable notifier (only enable req now) */
 351static int
 352pci_vfio_enable_notifier(struct rte_pci_device *dev, int vfio_dev_fd)
 353{
 354        int ret;
 355        int fd = -1;
 356
 357        /* set up an eventfd for req notifier */
 358        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
 359        if (fd < 0) {
 360                RTE_LOG(ERR, EAL, "Cannot set up eventfd, error %i (%s)\n",
 361                        errno, strerror(errno));
 362                return -1;
 363        }
 364
 365        dev->vfio_req_intr_handle.fd = fd;
 366        dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_VFIO_REQ;
 367        dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd;
 368
 369        ret = rte_intr_callback_register(&dev->vfio_req_intr_handle,
 370                                         pci_vfio_req_handler,
 371                                         (void *)&dev->device);
 372        if (ret) {
 373                RTE_LOG(ERR, EAL, "Fail to register req notifier handler.\n");
 374                goto error;
 375        }
 376
 377        ret = rte_intr_enable(&dev->vfio_req_intr_handle);
 378        if (ret) {
 379                RTE_LOG(ERR, EAL, "Fail to enable req notifier.\n");
 380                ret = rte_intr_callback_unregister(&dev->vfio_req_intr_handle,
 381                                                 pci_vfio_req_handler,
 382                                                 (void *)&dev->device);
 383                if (ret < 0)
 384                        RTE_LOG(ERR, EAL,
 385                                "Fail to unregister req notifier handler.\n");
 386                goto error;
 387        }
 388
 389        return 0;
 390error:
 391        close(fd);
 392
 393        dev->vfio_req_intr_handle.fd = -1;
 394        dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
 395        dev->vfio_req_intr_handle.vfio_dev_fd = -1;
 396
 397        return -1;
 398}
 399
 400/* disable notifier (only disable req now) */
 401static int
 402pci_vfio_disable_notifier(struct rte_pci_device *dev)
 403{
 404        int ret;
 405
 406        ret = rte_intr_disable(&dev->vfio_req_intr_handle);
 407        if (ret) {
 408                RTE_LOG(ERR, EAL, "fail to disable req notifier.\n");
 409                return -1;
 410        }
 411
 412        ret = rte_intr_callback_unregister_sync(&dev->vfio_req_intr_handle,
 413                                           pci_vfio_req_handler,
 414                                           (void *)&dev->device);
 415        if (ret < 0) {
 416                RTE_LOG(ERR, EAL,
 417                         "fail to unregister req notifier handler.\n");
 418                return -1;
 419        }
 420
 421        close(dev->vfio_req_intr_handle.fd);
 422
 423        dev->vfio_req_intr_handle.fd = -1;
 424        dev->vfio_req_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
 425        dev->vfio_req_intr_handle.vfio_dev_fd = -1;
 426
 427        return 0;
 428}
 429#endif
 430
 431static int
 432pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
 433{
 434        uint32_t ioport_bar;
 435        int ret;
 436
 437        ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
 438                          VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
 439                          + PCI_BASE_ADDRESS_0 + bar_index*4);
 440        if (ret != sizeof(ioport_bar)) {
 441                RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n",
 442                        PCI_BASE_ADDRESS_0 + bar_index*4);
 443                return -1;
 444        }
 445
 446        return (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) != 0;
 447}
 448
 449static int
 450pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
 451{
 452        if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
 453                RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
 454                return -1;
 455        }
 456
 457        if (pci_vfio_enable_bus_memory(vfio_dev_fd)) {
 458                RTE_LOG(ERR, EAL, "Cannot enable bus memory!\n");
 459                return -1;
 460        }
 461
 462        /* set bus mastering for the device */
 463        if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
 464                RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
 465                return -1;
 466        }
 467
 468        /*
 469         * Reset the device. If the device is not capable of resetting,
 470         * then it updates errno as EINVAL.
 471         */
 472        if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
 473                RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
 474                                errno, strerror(errno));
 475                return -1;
 476        }
 477
 478        return 0;
 479}
 480
 481static int
 482pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
 483                int bar_index, int additional_flags)
 484{
 485        struct memreg {
 486                uint64_t offset;
 487                size_t   size;
 488        } memreg[2] = {};
 489        void *bar_addr;
 490        struct pci_msix_table *msix_table = &vfio_res->msix_table;
 491        struct pci_map *bar = &vfio_res->maps[bar_index];
 492
 493        if (bar->size == 0) {
 494                RTE_LOG(DEBUG, EAL, "Bar size is 0, skip BAR%d\n", bar_index);
 495                return 0;
 496        }
 497
 498        if (msix_table->bar_index == bar_index) {
 499                /*
 500                 * VFIO will not let us map the MSI-X table,
 501                 * but we can map around it.
 502                 */
 503                uint32_t table_start = msix_table->offset;
 504                uint32_t table_end = table_start + msix_table->size;
 505                table_end = RTE_ALIGN(table_end, rte_mem_page_size());
 506                table_start = RTE_ALIGN_FLOOR(table_start, rte_mem_page_size());
 507
 508                /* If page-aligned start of MSI-X table is less than the
 509                 * actual MSI-X table start address, reassign to the actual
 510                 * start address.
 511                 */
 512                if (table_start < msix_table->offset)
 513                        table_start = msix_table->offset;
 514
 515                if (table_start == 0 && table_end >= bar->size) {
 516                        /* Cannot map this BAR */
 517                        RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
 518                        bar->size = 0;
 519                        bar->addr = 0;
 520                        return 0;
 521                }
 522
 523                memreg[0].offset = bar->offset;
 524                memreg[0].size = table_start;
 525                if (bar->size < table_end) {
 526                        /*
 527                         * If MSI-X table end is beyond BAR end, don't attempt
 528                         * to perform second mapping.
 529                         */
 530                        memreg[1].offset = 0;
 531                        memreg[1].size = 0;
 532                } else {
 533                        memreg[1].offset = bar->offset + table_end;
 534                        memreg[1].size = bar->size - table_end;
 535                }
 536
 537                RTE_LOG(DEBUG, EAL,
 538                        "Trying to map BAR%d that contains the MSI-X "
 539                        "table. Trying offsets: "
 540                        "0x%04" PRIx64 ":0x%04zx, 0x%04" PRIx64 ":0x%04zx\n",
 541                        bar_index,
 542                        memreg[0].offset, memreg[0].size,
 543                        memreg[1].offset, memreg[1].size);
 544        } else {
 545                memreg[0].offset = bar->offset;
 546                memreg[0].size = bar->size;
 547        }
 548
 549        /* reserve the address using an inaccessible mapping */
 550        bar_addr = mmap(bar->addr, bar->size, 0, MAP_PRIVATE |
 551                        MAP_ANONYMOUS | additional_flags, -1, 0);
 552        if (bar_addr != MAP_FAILED) {
 553                void *map_addr = NULL;
 554                if (memreg[0].size) {
 555                        /* actual map of first part */
 556                        map_addr = pci_map_resource(bar_addr, vfio_dev_fd,
 557                                                        memreg[0].offset,
 558                                                        memreg[0].size,
 559                                                        RTE_MAP_FORCE_ADDRESS);
 560                }
 561
 562                /*
 563                 * Regarding "memreg[0].size == 0":
 564                 * If this BAR has MSI-X table, memreg[0].size (the
 565                 * first part or the part before the table) can
 566                 * legitimately be 0 for hardware using vector table
 567                 * offset 0 (i.e. first part does not exist).
 568                 *
 569                 * When memreg[0].size is 0, "mapping the first part"
 570                 * never happens, and map_addr is NULL at this
 571                 * point. So check that mapping has been actually
 572                 * attempted.
 573                 */
 574                /* if there's a second part, try to map it */
 575                if ((map_addr != NULL || memreg[0].size == 0)
 576                        && memreg[1].offset && memreg[1].size) {
 577                        void *second_addr = RTE_PTR_ADD(bar_addr,
 578                                                (uintptr_t)(memreg[1].offset -
 579                                                bar->offset));
 580                        map_addr = pci_map_resource(second_addr,
 581                                                        vfio_dev_fd,
 582                                                        memreg[1].offset,
 583                                                        memreg[1].size,
 584                                                        RTE_MAP_FORCE_ADDRESS);
 585                }
 586
 587                if (map_addr == NULL) {
 588                        munmap(bar_addr, bar->size);
 589                        bar_addr = MAP_FAILED;
 590                        RTE_LOG(ERR, EAL, "Failed to map pci BAR%d\n",
 591                                        bar_index);
 592                        return -1;
 593                }
 594        } else {
 595                RTE_LOG(ERR, EAL,
 596                                "Failed to create inaccessible mapping for BAR%d\n",
 597                                bar_index);
 598                return -1;
 599        }
 600
 601        bar->addr = bar_addr;
 602        return 0;
 603}
 604
 605/*
 606 * region info may contain capability headers, so we need to keep reallocating
 607 * the memory until we match allocated memory size with argsz.
 608 */
 609static int
 610pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
 611                int region)
 612{
 613        struct vfio_region_info *ri;
 614        size_t argsz = sizeof(*ri);
 615        int ret;
 616
 617        ri = malloc(sizeof(*ri));
 618        if (ri == NULL) {
 619                RTE_LOG(ERR, EAL,
 620                        "Cannot allocate memory for VFIO region info\n");
 621                return -1;
 622        }
 623again:
 624        memset(ri, 0, argsz);
 625        ri->argsz = argsz;
 626        ri->index = region;
 627
 628        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri);
 629        if (ret < 0) {
 630                free(ri);
 631                return ret;
 632        }
 633        if (ri->argsz != argsz) {
 634                struct vfio_region_info *tmp;
 635
 636                argsz = ri->argsz;
 637                tmp = realloc(ri, argsz);
 638
 639                if (tmp == NULL) {
 640                        /* realloc failed but the ri is still there */
 641                        free(ri);
 642                        RTE_LOG(ERR, EAL,
 643                                "Cannot reallocate memory for VFIO region info\n");
 644                        return -1;
 645                }
 646                ri = tmp;
 647                goto again;
 648        }
 649        *info = ri;
 650
 651        return 0;
 652}
 653
 654static struct vfio_info_cap_header *
 655pci_vfio_info_cap(struct vfio_region_info *info, int cap)
 656{
 657        struct vfio_info_cap_header *h;
 658        size_t offset;
 659
 660        if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
 661                /* VFIO info does not advertise capabilities */
 662                return NULL;
 663        }
 664
 665        offset = VFIO_CAP_OFFSET(info);
 666        while (offset != 0) {
 667                h = RTE_PTR_ADD(info, offset);
 668                if (h->id == cap)
 669                        return h;
 670                offset = h->next;
 671        }
 672        return NULL;
 673}
 674
 675static int
 676pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
 677{
 678        struct vfio_region_info *info;
 679        int ret;
 680
 681        ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
 682        if (ret < 0)
 683                return -1;
 684
 685        ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
 686
 687        /* cleanup */
 688        free(info);
 689
 690        return ret;
 691}
 692
 693
 694static int
 695pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 696{
 697        struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
 698        char pci_addr[PATH_MAX] = {0};
 699        int vfio_dev_fd;
 700        struct rte_pci_addr *loc = &dev->addr;
 701        int i, ret;
 702        struct mapped_pci_resource *vfio_res = NULL;
 703        struct mapped_pci_res_list *vfio_res_list =
 704                RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
 705
 706        struct pci_map *maps;
 707
 708        dev->intr_handle.fd = -1;
 709#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 710        dev->vfio_req_intr_handle.fd = -1;
 711#endif
 712
 713        /* store PCI address string */
 714        snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
 715                        loc->domain, loc->bus, loc->devid, loc->function);
 716
 717        ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
 718                                        &vfio_dev_fd, &device_info);
 719        if (ret)
 720                return ret;
 721
 722        /* allocate vfio_res and get region info */
 723        vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
 724        if (vfio_res == NULL) {
 725                RTE_LOG(ERR, EAL,
 726                        "Cannot store VFIO mmap details\n");
 727                goto err_vfio_dev_fd;
 728        }
 729        memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));
 730
 731        /* get number of registers (up to BAR5) */
 732        vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
 733                        VFIO_PCI_BAR5_REGION_INDEX + 1);
 734
 735        /* map BARs */
 736        maps = vfio_res->maps;
 737
 738        vfio_res->msix_table.bar_index = -1;
 739        /* get MSI-X BAR, if any (we have to know where it is because we can't
 740         * easily mmap it when using VFIO)
 741         */
 742        ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
 743        if (ret < 0) {
 744                RTE_LOG(ERR, EAL, "%s cannot get MSI-X BAR number!\n",
 745                                pci_addr);
 746                goto err_vfio_res;
 747        }
 748        /* if we found our MSI-X BAR region, check if we can mmap it */
 749        if (vfio_res->msix_table.bar_index != -1) {
 750                int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
 751                                vfio_res->msix_table.bar_index);
 752                if (ret < 0) {
 753                        RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
 754                        goto err_vfio_res;
 755                } else if (ret != 0) {
 756                        /* we can map it, so we don't care where it is */
 757                        RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
 758                        vfio_res->msix_table.bar_index = -1;
 759                }
 760        }
 761
 762        for (i = 0; i < vfio_res->nb_maps; i++) {
 763                struct vfio_region_info *reg = NULL;
 764                void *bar_addr;
 765
 766                ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
 767                if (ret < 0) {
 768                        RTE_LOG(ERR, EAL,
 769                                "%s cannot get device region info error "
 770                                "%i (%s)\n", pci_addr, errno, strerror(errno));
 771                        goto err_vfio_res;
 772                }
 773
 774                /* chk for io port region */
 775                ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
 776                if (ret < 0) {
 777                        free(reg);
 778                        goto err_vfio_res;
 779                } else if (ret) {
 780                        RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
 781                                        i);
 782                        free(reg);
 783                        continue;
 784                }
 785
 786                /* skip non-mmapable BARs */
 787                if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
 788                        free(reg);
 789                        continue;
 790                }
 791
 792                /* try mapping somewhere close to the end of hugepages */
 793                if (pci_map_addr == NULL)
 794                        pci_map_addr = pci_find_max_end_va();
 795
 796                bar_addr = pci_map_addr;
 797                pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);
 798
 799                pci_map_addr = RTE_PTR_ALIGN(pci_map_addr,
 800                                        sysconf(_SC_PAGE_SIZE));
 801
 802                maps[i].addr = bar_addr;
 803                maps[i].offset = reg->offset;
 804                maps[i].size = reg->size;
 805                maps[i].path = NULL; /* vfio doesn't have per-resource paths */
 806
 807                ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
 808                if (ret < 0) {
 809                        RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
 810                                        pci_addr, i, strerror(errno));
 811                        free(reg);
 812                        goto err_vfio_res;
 813                }
 814
 815                dev->mem_resource[i].addr = maps[i].addr;
 816
 817                free(reg);
 818        }
 819
 820        if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
 821                RTE_LOG(ERR, EAL, "%s setup device failed\n", pci_addr);
 822                goto err_vfio_res;
 823        }
 824
 825#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 826        if (pci_vfio_enable_notifier(dev, vfio_dev_fd) != 0) {
 827                RTE_LOG(ERR, EAL, "Error setting up notifier!\n");
 828                goto err_vfio_res;
 829        }
 830
 831#endif
 832        TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);
 833
 834        return 0;
 835err_vfio_res:
 836        rte_free(vfio_res);
 837err_vfio_dev_fd:
 838        rte_vfio_release_device(rte_pci_get_sysfs_path(),
 839                        pci_addr, vfio_dev_fd);
 840        return -1;
 841}
 842
 843static int
 844pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 845{
 846        struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
 847        char pci_addr[PATH_MAX] = {0};
 848        int vfio_dev_fd;
 849        struct rte_pci_addr *loc = &dev->addr;
 850        int i, ret;
 851        struct mapped_pci_resource *vfio_res = NULL;
 852        struct mapped_pci_res_list *vfio_res_list =
 853                RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
 854
 855        struct pci_map *maps;
 856
 857        dev->intr_handle.fd = -1;
 858#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 859        dev->vfio_req_intr_handle.fd = -1;
 860#endif
 861
 862        /* store PCI address string */
 863        snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
 864                        loc->domain, loc->bus, loc->devid, loc->function);
 865
 866        /* if we're in a secondary process, just find our tailq entry */
 867        TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
 868                if (rte_pci_addr_cmp(&vfio_res->pci_addr,
 869                                                 &dev->addr))
 870                        continue;
 871                break;
 872        }
 873        /* if we haven't found our tailq entry, something's wrong */
 874        if (vfio_res == NULL) {
 875                RTE_LOG(ERR, EAL, "%s cannot find TAILQ entry for PCI device!\n",
 876                                pci_addr);
 877                return -1;
 878        }
 879
 880        ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
 881                                        &vfio_dev_fd, &device_info);
 882        if (ret)
 883                return ret;
 884
 885        /* map BARs */
 886        maps = vfio_res->maps;
 887
 888        for (i = 0; i < vfio_res->nb_maps; i++) {
 889                ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, MAP_FIXED);
 890                if (ret < 0) {
 891                        RTE_LOG(ERR, EAL, "%s mapping BAR%i failed: %s\n",
 892                                        pci_addr, i, strerror(errno));
 893                        goto err_vfio_dev_fd;
 894                }
 895
 896                dev->mem_resource[i].addr = maps[i].addr;
 897        }
 898
 899        /* we need save vfio_dev_fd, so it can be used during release */
 900        dev->intr_handle.vfio_dev_fd = vfio_dev_fd;
 901#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 902        dev->vfio_req_intr_handle.vfio_dev_fd = vfio_dev_fd;
 903#endif
 904
 905        return 0;
 906err_vfio_dev_fd:
 907        rte_vfio_release_device(rte_pci_get_sysfs_path(),
 908                        pci_addr, vfio_dev_fd);
 909        return -1;
 910}
 911
 912/*
 913 * map the PCI resources of a PCI device in virtual memory (VFIO version).
 914 * primary and secondary processes follow almost exactly the same path
 915 */
 916int
 917pci_vfio_map_resource(struct rte_pci_device *dev)
 918{
 919        if (rte_eal_process_type() == RTE_PROC_PRIMARY)
 920                return pci_vfio_map_resource_primary(dev);
 921        else
 922                return pci_vfio_map_resource_secondary(dev);
 923}
 924
 925static struct mapped_pci_resource *
 926find_and_unmap_vfio_resource(struct mapped_pci_res_list *vfio_res_list,
 927                        struct rte_pci_device *dev,
 928                        const char *pci_addr)
 929{
 930        struct mapped_pci_resource *vfio_res = NULL;
 931        struct pci_map *maps;
 932        int i;
 933
 934        /* Get vfio_res */
 935        TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
 936                if (rte_pci_addr_cmp(&vfio_res->pci_addr, &dev->addr))
 937                        continue;
 938                break;
 939        }
 940
 941        if  (vfio_res == NULL)
 942                return vfio_res;
 943
 944        RTE_LOG(INFO, EAL, "Releasing PCI mapped resource for %s\n",
 945                pci_addr);
 946
 947        maps = vfio_res->maps;
 948        for (i = 0; i < vfio_res->nb_maps; i++) {
 949
 950                /*
 951                 * We do not need to be aware of MSI-X table BAR mappings as
 952                 * when mapping. Just using current maps array is enough
 953                 */
 954                if (maps[i].addr) {
 955                        RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
 956                                pci_addr, maps[i].addr);
 957                        pci_unmap_resource(maps[i].addr, maps[i].size);
 958                }
 959        }
 960
 961        return vfio_res;
 962}
 963
 964static int
 965pci_vfio_unmap_resource_primary(struct rte_pci_device *dev)
 966{
 967        char pci_addr[PATH_MAX] = {0};
 968        struct rte_pci_addr *loc = &dev->addr;
 969        struct mapped_pci_resource *vfio_res = NULL;
 970        struct mapped_pci_res_list *vfio_res_list;
 971        int ret;
 972
 973        /* store PCI address string */
 974        snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
 975                        loc->domain, loc->bus, loc->devid, loc->function);
 976
 977#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 978        ret = pci_vfio_disable_notifier(dev);
 979        if (ret) {
 980                RTE_LOG(ERR, EAL, "fail to disable req notifier.\n");
 981                return -1;
 982        }
 983
 984#endif
 985        if (close(dev->intr_handle.fd) < 0) {
 986                RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
 987                        pci_addr);
 988                return -1;
 989        }
 990
 991        if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) {
 992                RTE_LOG(ERR, EAL, "%s cannot unset bus mastering for PCI device!\n",
 993                                pci_addr);
 994                return -1;
 995        }
 996
 997        ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
 998                                  dev->intr_handle.vfio_dev_fd);
 999        if (ret < 0) {
1000                RTE_LOG(ERR, EAL, "Cannot release VFIO device\n");
1001                return ret;
1002        }
1003
1004        vfio_res_list =
1005                RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
1006        vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);
1007
1008        /* if we haven't found our tailq entry, something's wrong */
1009        if (vfio_res == NULL) {
1010                RTE_LOG(ERR, EAL, "%s cannot find TAILQ entry for PCI device!\n",
1011                                pci_addr);
1012                return -1;
1013        }
1014
1015        TAILQ_REMOVE(vfio_res_list, vfio_res, next);
1016        rte_free(vfio_res);
1017        return 0;
1018}
1019
1020static int
1021pci_vfio_unmap_resource_secondary(struct rte_pci_device *dev)
1022{
1023        char pci_addr[PATH_MAX] = {0};
1024        struct rte_pci_addr *loc = &dev->addr;
1025        struct mapped_pci_resource *vfio_res = NULL;
1026        struct mapped_pci_res_list *vfio_res_list;
1027        int ret;
1028
1029        /* store PCI address string */
1030        snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
1031                        loc->domain, loc->bus, loc->devid, loc->function);
1032
1033        ret = rte_vfio_release_device(rte_pci_get_sysfs_path(), pci_addr,
1034                                  dev->intr_handle.vfio_dev_fd);
1035        if (ret < 0) {
1036                RTE_LOG(ERR, EAL, "Cannot release VFIO device\n");
1037                return ret;
1038        }
1039
1040        vfio_res_list =
1041                RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
1042        vfio_res = find_and_unmap_vfio_resource(vfio_res_list, dev, pci_addr);
1043
1044        /* if we haven't found our tailq entry, something's wrong */
1045        if (vfio_res == NULL) {
1046                RTE_LOG(ERR, EAL, "%s cannot find TAILQ entry for PCI device!\n",
1047                                pci_addr);
1048                return -1;
1049        }
1050
1051        return 0;
1052}
1053
1054int
1055pci_vfio_unmap_resource(struct rte_pci_device *dev)
1056{
1057        if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1058                return pci_vfio_unmap_resource_primary(dev);
1059        else
1060                return pci_vfio_unmap_resource_secondary(dev);
1061}
1062
1063int
1064pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
1065                    struct rte_pci_ioport *p)
1066{
1067        if (bar < VFIO_PCI_BAR0_REGION_INDEX ||
1068            bar > VFIO_PCI_BAR5_REGION_INDEX) {
1069                RTE_LOG(ERR, EAL, "invalid bar (%d)!\n", bar);
1070                return -1;
1071        }
1072
1073        p->dev = dev;
1074        p->base = VFIO_GET_REGION_ADDR(bar);
1075        return 0;
1076}
1077
1078void
1079pci_vfio_ioport_read(struct rte_pci_ioport *p,
1080                     void *data, size_t len, off_t offset)
1081{
1082        const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
1083
1084        if (pread64(intr_handle->vfio_dev_fd, data,
1085                    len, p->base + offset) <= 0)
1086                RTE_LOG(ERR, EAL,
1087                        "Can't read from PCI bar (%" PRIu64 ") : offset (%x)\n",
1088                        VFIO_GET_REGION_IDX(p->base), (int)offset);
1089}
1090
1091void
1092pci_vfio_ioport_write(struct rte_pci_ioport *p,
1093                      const void *data, size_t len, off_t offset)
1094{
1095        const struct rte_intr_handle *intr_handle = &p->dev->intr_handle;
1096
1097        if (pwrite64(intr_handle->vfio_dev_fd, data,
1098                     len, p->base + offset) <= 0)
1099                RTE_LOG(ERR, EAL,
1100                        "Can't write to PCI bar (%" PRIu64 ") : offset (%x)\n",
1101                        VFIO_GET_REGION_IDX(p->base), (int)offset);
1102}
1103
/*
 * I/O port unmap hook for VFIO-bound devices.
 *
 * The handle is not inspected and no work is performed; the function
 * unconditionally returns -1.
 */
int
pci_vfio_ioport_unmap(struct rte_pci_ioport *p)
{
        /* parameter intentionally unused */
        RTE_SET_USED(p);

        return -1;
}
1110
/*
 * Report whether VFIO is enabled for the "vfio_pci" module, as answered by
 * rte_vfio_is_enabled(); its return value is passed through unchanged.
 */
int
pci_vfio_is_enabled(void)
{
        static const char vfio_pci_module[] = "vfio_pci";

        return rte_vfio_is_enabled(vfio_pci_module);
}
1116#endif
1117