dpdk/lib/librte_eal/linux/eal_interrupts.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2010-2014 Intel Corporation
   3 */
   4
   5#include <stdio.h>
   6#include <stdint.h>
   7#include <stdlib.h>
   8#include <pthread.h>
   9#include <sys/queue.h>
  10#include <stdarg.h>
  11#include <unistd.h>
  12#include <string.h>
  13#include <errno.h>
  14#include <inttypes.h>
  15#include <sys/epoll.h>
  16#include <sys/signalfd.h>
  17#include <sys/ioctl.h>
  18#include <sys/eventfd.h>
  19#include <assert.h>
  20#include <stdbool.h>
  21
  22#include <rte_common.h>
  23#include <rte_interrupts.h>
  24#include <rte_memory.h>
  25#include <rte_launch.h>
  26#include <rte_eal.h>
  27#include <rte_per_lcore.h>
  28#include <rte_lcore.h>
  29#include <rte_branch_prediction.h>
  30#include <rte_debug.h>
  31#include <rte_log.h>
  32#include <rte_errno.h>
  33#include <rte_spinlock.h>
  34#include <rte_pause.h>
  35#include <rte_vfio.h>
  36#include <rte_eal_trace.h>
  37
  38#include "eal_private.h"
  39#include "eal_vfio.h"
  40#include "eal_thread.h"
  41
  42#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
  43#define NB_OTHER_INTR               1
  44
  45static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
  46
  47/**
  48 * union for pipe fds.
  49 */
   50union intr_pipefds {
  51        struct {
  52                int pipefd[2];
  53        };
  54        struct {
  55                int readfd;
  56                int writefd;
  57        };
  58};
  59
  60/**
  61 * union buffer for reading on different devices
  62 */
  63union rte_intr_read_buffer {
  64        int uio_intr_count;              /* for uio device */
  65#ifdef VFIO_PRESENT
  66        uint64_t vfio_intr_count;        /* for vfio device */
  67#endif
  68        uint64_t timerfd_num;            /* for timerfd */
  69        char charbuf[16];                /* for others */
  70};
  71
  72TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
  73TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
  74
  75struct rte_intr_callback {
  76        TAILQ_ENTRY(rte_intr_callback) next;
  77        rte_intr_callback_fn cb_fn;  /**< callback address */
  78        void *cb_arg;                /**< parameter for callback */
  79        uint8_t pending_delete;      /**< delete after callback is called */
  80        rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
  81};
  82
  83struct rte_intr_source {
  84        TAILQ_ENTRY(rte_intr_source) next;
  85        struct rte_intr_handle intr_handle; /**< interrupt handle */
  86        struct rte_intr_cb_list callbacks;  /**< user callbacks */
  87        uint32_t active;
  88};
  89
  90/* global spinlock for interrupt data operation */
  91static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
  92
  93/* union buffer for pipe read/write */
  94static union intr_pipefds intr_pipe;
  95
  96/* interrupt sources list */
  97static struct rte_intr_source_list intr_sources;
  98
  99/* interrupt handling thread */
 100static pthread_t intr_thread;
 101
 102/* VFIO interrupts */
 103#ifdef VFIO_PRESENT
 104
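/* irq set buffer length for a single eventfd (INTx/MSI): header + one fd */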
 105#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
 106/* irq set buffer length for queue interrupts and LSC interrupt */
 107#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
 108                              sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
 109
 110/* enable legacy (INTx) interrupts */
 111static int
 112vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
 113        struct vfio_irq_set *irq_set;
 114        char irq_set_buf[IRQ_SET_BUF_LEN];
 115        int len, ret;
 116        int *fd_ptr;
 117
 118        len = sizeof(irq_set_buf);
 119
 120        /* enable INTx */
 121        irq_set = (struct vfio_irq_set *) irq_set_buf;
 122        irq_set->argsz = len;
 123        irq_set->count = 1;
 124        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 125        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 126        irq_set->start = 0;
 127        fd_ptr = (int *) &irq_set->data;
 128        *fd_ptr = intr_handle->fd;
 129
 130        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 131
 132        if (ret) {
 133                RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
 134                                                intr_handle->fd);
 135                return -1;
 136        }
 137
 138        /* unmask INTx after enabling */
 139        memset(irq_set, 0, len);
 140        len = sizeof(struct vfio_irq_set);
 141        irq_set->argsz = len;
 142        irq_set->count = 1;
 143        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
 144        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 145        irq_set->start = 0;
 146
 147        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 148
 149        if (ret) {
 150                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
 151                                                intr_handle->fd);
 152                return -1;
 153        }
 154        return 0;
 155}
 156
 157/* disable legacy (INTx) interrupts */
 158static int
 159vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
 160        struct vfio_irq_set *irq_set;
 161        char irq_set_buf[IRQ_SET_BUF_LEN];
 162        int len, ret;
 163
 164        len = sizeof(struct vfio_irq_set);
 165
 166        /* mask interrupts before disabling */
 167        irq_set = (struct vfio_irq_set *) irq_set_buf;
 168        irq_set->argsz = len;
 169        irq_set->count = 1;
 170        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
 171        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 172        irq_set->start = 0;
 173
 174        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 175
 176        if (ret) {
 177                RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
 178                                                intr_handle->fd);
 179                return -1;
 180        }
 181
  182        /* disable INTx */
 183        memset(irq_set, 0, len);
 184        irq_set->argsz = len;
 185        irq_set->count = 0;
 186        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
 187        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 188        irq_set->start = 0;
 189
 190        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 191
 192        if (ret) {
 193                RTE_LOG(ERR, EAL,
 194                        "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
 195                return -1;
 196        }
 197        return 0;
 198}
 199
 200/* unmask/ack legacy (INTx) interrupts */
 201static int
 202vfio_ack_intx(const struct rte_intr_handle *intr_handle)
 203{
 204        struct vfio_irq_set irq_set;
 205
 206        /* unmask INTx */
 207        memset(&irq_set, 0, sizeof(irq_set));
 208        irq_set.argsz = sizeof(irq_set);
 209        irq_set.count = 1;
 210        irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
 211        irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
 212        irq_set.start = 0;
 213
 214        if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
 215                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
 216                        intr_handle->fd);
 217                return -1;
 218        }
 219        return 0;
 220}
 221
 222/* enable MSI interrupts */
 223static int
 224vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
 225        int len, ret;
 226        char irq_set_buf[IRQ_SET_BUF_LEN];
 227        struct vfio_irq_set *irq_set;
 228        int *fd_ptr;
 229
 230        len = sizeof(irq_set_buf);
 231
 232        irq_set = (struct vfio_irq_set *) irq_set_buf;
 233        irq_set->argsz = len;
 234        irq_set->count = 1;
 235        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 236        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
 237        irq_set->start = 0;
 238        fd_ptr = (int *) &irq_set->data;
 239        *fd_ptr = intr_handle->fd;
 240
 241        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 242
 243        if (ret) {
 244                RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
 245                                                intr_handle->fd);
 246                return -1;
 247        }
 248        return 0;
 249}
 250
 251/* disable MSI interrupts */
 252static int
 253vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
 254        struct vfio_irq_set *irq_set;
 255        char irq_set_buf[IRQ_SET_BUF_LEN];
 256        int len, ret;
 257
 258        len = sizeof(struct vfio_irq_set);
 259
 260        irq_set = (struct vfio_irq_set *) irq_set_buf;
 261        irq_set->argsz = len;
 262        irq_set->count = 0;
 263        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
 264        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
 265        irq_set->start = 0;
 266
 267        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 268
 269        if (ret)
 270                RTE_LOG(ERR, EAL,
 271                        "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
 272
 273        return ret;
 274}
 275
 276/* enable MSI-X interrupts */
 277static int
 278vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
 279        int len, ret;
 280        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 281        struct vfio_irq_set *irq_set;
 282        int *fd_ptr;
 283
 284        len = sizeof(irq_set_buf);
 285
 286        irq_set = (struct vfio_irq_set *) irq_set_buf;
 287        irq_set->argsz = len;
 288        /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
 289        irq_set->count = intr_handle->max_intr ?
 290                (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
 291                RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
 292        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 293        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 294        irq_set->start = 0;
 295        fd_ptr = (int *) &irq_set->data;
  296        /* INTR vector offset 0 is reserved for the non-efd mapping */
 297        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
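        /* per-queue eventfds follow at RTE_INTR_VEC_RXTX_OFFSET */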
 298        memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
 299                sizeof(*intr_handle->efds) * intr_handle->nb_efd);
 300
 301        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 302
 303        if (ret) {
 304                RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
 305                                                intr_handle->fd);
 306                return -1;
 307        }
 308
 309        return 0;
 310}
 311
 312/* disable MSI-X interrupts */
 313static int
 314vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
 315        struct vfio_irq_set *irq_set;
 316        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 317        int len, ret;
 318
 319        len = sizeof(struct vfio_irq_set);
 320
 321        irq_set = (struct vfio_irq_set *) irq_set_buf;
 322        irq_set->argsz = len;
 323        irq_set->count = 0;
 324        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
 325        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 326        irq_set->start = 0;
 327
 328        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 329
 330        if (ret)
 331                RTE_LOG(ERR, EAL,
 332                        "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
 333
 334        return ret;
 335}
 336
 337#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 338/* enable req notifier */
 339static int
 340vfio_enable_req(const struct rte_intr_handle *intr_handle)
 341{
 342        int len, ret;
 343        char irq_set_buf[IRQ_SET_BUF_LEN];
 344        struct vfio_irq_set *irq_set;
 345        int *fd_ptr;
 346
 347        len = sizeof(irq_set_buf);
 348
 349        irq_set = (struct vfio_irq_set *) irq_set_buf;
 350        irq_set->argsz = len;
 351        irq_set->count = 1;
 352        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 353                         VFIO_IRQ_SET_ACTION_TRIGGER;
 354        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
 355        irq_set->start = 0;
 356        fd_ptr = (int *) &irq_set->data;
 357        *fd_ptr = intr_handle->fd;
 358
 359        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 360
 361        if (ret) {
 362                RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
 363                                                intr_handle->fd);
 364                return -1;
 365        }
 366
 367        return 0;
 368}
 369
 370/* disable req notifier */
 371static int
 372vfio_disable_req(const struct rte_intr_handle *intr_handle)
 373{
 374        struct vfio_irq_set *irq_set;
 375        char irq_set_buf[IRQ_SET_BUF_LEN];
 376        int len, ret;
 377
 378        len = sizeof(struct vfio_irq_set);
 379
 380        irq_set = (struct vfio_irq_set *) irq_set_buf;
 381        irq_set->argsz = len;
 382        irq_set->count = 0;
 383        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
 384        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
 385        irq_set->start = 0;
 386
 387        ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 388
 389        if (ret)
 390                RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
 391                        intr_handle->fd);
 392
 393        return ret;
 394}
 395#endif
 396#endif
 397
 398static int
 399uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
 400{
 401        unsigned char command_high;
 402
 403        /* use UIO config file descriptor for uio_pci_generic */
 404        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
 405                RTE_LOG(ERR, EAL,
 406                        "Error reading interrupts status for fd %d\n",
 407                        intr_handle->uio_cfg_fd);
 408                return -1;
 409        }
 410        /* disable interrupts */
 411        command_high |= 0x4;
 412        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
 413                RTE_LOG(ERR, EAL,
 414                        "Error disabling interrupts for fd %d\n",
 415                        intr_handle->uio_cfg_fd);
 416                return -1;
 417        }
 418
 419        return 0;
 420}
 421
 422static int
 423uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
 424{
 425        unsigned char command_high;
 426
 427        /* use UIO config file descriptor for uio_pci_generic */
 428        if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
 429                RTE_LOG(ERR, EAL,
 430                        "Error reading interrupts status for fd %d\n",
 431                        intr_handle->uio_cfg_fd);
 432                return -1;
 433        }
 434        /* enable interrupts */
 435        command_high &= ~0x4;
 436        if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
 437                RTE_LOG(ERR, EAL,
 438                        "Error enabling interrupts for fd %d\n",
 439                        intr_handle->uio_cfg_fd);
 440                return -1;
 441        }
 442
 443        return 0;
 444}
 445
 446static int
 447uio_intr_disable(const struct rte_intr_handle *intr_handle)
 448{
 449        const int value = 0;
 450
 451        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
 452                RTE_LOG(ERR, EAL,
 453                        "Error disabling interrupts for fd %d (%s)\n",
 454                        intr_handle->fd, strerror(errno));
 455                return -1;
 456        }
 457        return 0;
 458}
 459
 460static int
 461uio_intr_enable(const struct rte_intr_handle *intr_handle)
 462{
 463        const int value = 1;
 464
 465        if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
 466                RTE_LOG(ERR, EAL,
 467                        "Error enabling interrupts for fd %d (%s)\n",
 468                        intr_handle->fd, strerror(errno));
 469                return -1;
 470        }
 471        return 0;
 472}
 473
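/**
 * Register a callback for the interrupt source behind the handle's fd.
 * The callback runs in the EAL interrupt thread. Adding the first
 * callback for an fd wakes that thread through the internal pipe so it
 * rebuilds its epoll wait list.
 */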
 474int
 475rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
 476                        rte_intr_callback_fn cb, void *cb_arg)
 477{
 478        int ret, wake_thread;
 479        struct rte_intr_source *src;
 480        struct rte_intr_callback *callback;
 481
 482        wake_thread = 0;
 483
 484        /* first do parameter checking */
 485        if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
 486                RTE_LOG(ERR, EAL,
 487                        "Registering with invalid input parameter\n");
 488                return -EINVAL;
 489        }
 490
 491        /* allocate a new interrupt callback entity */
 492        callback = calloc(1, sizeof(*callback));
 493        if (callback == NULL) {
 494                RTE_LOG(ERR, EAL, "Can not allocate memory\n");
 495                return -ENOMEM;
 496        }
 497        callback->cb_fn = cb;
 498        callback->cb_arg = cb_arg;
 499        callback->pending_delete = 0;
 500        callback->ucb_fn = NULL;
 501
 502        rte_spinlock_lock(&intr_lock);
 503
 504        /* check if there is at least one callback registered for the fd */
 505        TAILQ_FOREACH(src, &intr_sources, next) {
 506                if (src->intr_handle.fd == intr_handle->fd) {
  507                        /* there were no callbacks for this fd yet */
 508                        if (TAILQ_EMPTY(&src->callbacks))
 509                                wake_thread = 1;
 510
 511                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
 512                        ret = 0;
 513                        break;
 514                }
 515        }
 516
 517        /* no existing callbacks for this - add new source */
 518        if (src == NULL) {
 519                src = calloc(1, sizeof(*src));
 520                if (src == NULL) {
 521                        RTE_LOG(ERR, EAL, "Can not allocate memory\n");
 522                        free(callback);
 523                        ret = -ENOMEM;
 524                } else {
 525                        src->intr_handle = *intr_handle;
 526                        TAILQ_INIT(&src->callbacks);
 527                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
 528                        TAILQ_INSERT_TAIL(&intr_sources, src, next);
 529                        wake_thread = 1;
 530                        ret = 0;
 531                }
 532        }
 533
 534        rte_spinlock_unlock(&intr_lock);
 535
  536        /**
  537         * check if we need to notify the pipe fd monitored by
  538         * epoll_wait so that it rebuilds its wait list.
  539         */
 540        if (wake_thread)
 541                if (write(intr_pipe.writefd, "1", 1) < 0)
 542                        ret = -EPIPE;
 543
 544        rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
 545        return ret;
 546}
 547
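/**
 * Mark matching callbacks for deferred removal. Unlike
 * rte_intr_callback_unregister(), this may be called while the source is
 * active (i.e. from within a callback); the interrupt thread later frees
 * each marked callback, invoking ucb_fn first if it is set. A cb_arg of
 * (void *)-1 matches any argument. Returns the number of callbacks marked.
 */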
 548int
 549rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
 550                                rte_intr_callback_fn cb_fn, void *cb_arg,
 551                                rte_intr_unregister_callback_fn ucb_fn)
 552{
 553        int ret;
 554        struct rte_intr_source *src;
 555        struct rte_intr_callback *cb, *next;
 556
 557        /* do parameter checking first */
 558        if (intr_handle == NULL || intr_handle->fd < 0) {
 559                RTE_LOG(ERR, EAL,
 560                "Unregistering with invalid input parameter\n");
 561                return -EINVAL;
 562        }
 563
 564        rte_spinlock_lock(&intr_lock);
 565
  566        /* check if an interrupt source exists for the fd */
 567        TAILQ_FOREACH(src, &intr_sources, next)
 568                if (src->intr_handle.fd == intr_handle->fd)
 569                        break;
 570
 571        /* No interrupt source registered for the fd */
 572        if (src == NULL) {
 573                ret = -ENOENT;
 574
 575        /* only usable if the source is active */
 576        } else if (src->active == 0) {
 577                ret = -EAGAIN;
 578
 579        } else {
 580                ret = 0;
 581
 582                /* walk through the callbacks and mark all that match. */
 583                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
 584                        next = TAILQ_NEXT(cb, next);
 585                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
 586                                        cb->cb_arg == cb_arg)) {
 587                                cb->pending_delete = 1;
 588                                cb->ucb_fn = ucb_fn;
 589                                ret++;
 590                        }
 591                }
 592        }
 593
 594        rte_spinlock_unlock(&intr_lock);
 595
 596        return ret;
 597}
 598
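/**
 * Remove all callbacks matching cb_fn/cb_arg for the handle's fd. Fails
 * with -EAGAIN while the source is active (a callback is running); use
 * rte_intr_callback_unregister_pending() in that case. Returns the
 * number of callbacks removed.
 */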
 599int
 600rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
 601                        rte_intr_callback_fn cb_fn, void *cb_arg)
 602{
 603        int ret;
 604        struct rte_intr_source *src;
 605        struct rte_intr_callback *cb, *next;
 606
 607        /* do parameter checking first */
 608        if (intr_handle == NULL || intr_handle->fd < 0) {
 609                RTE_LOG(ERR, EAL,
 610                "Unregistering with invalid input parameter\n");
 611                return -EINVAL;
 612        }
 613
 614        rte_spinlock_lock(&intr_lock);
 615
  616        /* check if an interrupt source exists for the fd */
 617        TAILQ_FOREACH(src, &intr_sources, next)
 618                if (src->intr_handle.fd == intr_handle->fd)
 619                        break;
 620
 621        /* No interrupt source registered for the fd */
 622        if (src == NULL) {
 623                ret = -ENOENT;
 624
 625        /* interrupt source has some active callbacks right now. */
 626        } else if (src->active != 0) {
 627                ret = -EAGAIN;
 628
 629        /* ok to remove. */
 630        } else {
 631                ret = 0;
 632
  633                /* walk through the callbacks and remove all that match. */
 634                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
 635
 636                        next = TAILQ_NEXT(cb, next);
 637
 638                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
 639                                        cb->cb_arg == cb_arg)) {
 640                                TAILQ_REMOVE(&src->callbacks, cb, next);
 641                                free(cb);
 642                                ret++;
 643                        }
 644                }
 645
  646                /* if no callbacks are left for the source, remove it. */
 647                if (TAILQ_EMPTY(&src->callbacks)) {
 648                        TAILQ_REMOVE(&intr_sources, src, next);
 649                        free(src);
 650                }
 651        }
 652
 653        rte_spinlock_unlock(&intr_lock);
 654
  655        /* notify the pipe fd monitored by epoll_wait to rebuild the wait list */
 656        if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
 657                ret = -EPIPE;
 658        }
 659
 660        rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
 661                ret);
 662        return ret;
 663}
 664
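/**
 * Enable the interrupt for the given handle, dispatching on its type:
 * a write to the UIO fd, a PCI config-space toggle for uio INTx, or a
 * VFIO_DEVICE_SET_IRQS ioctl. VDEV handles are a no-op.
 */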
 665int
 666rte_intr_enable(const struct rte_intr_handle *intr_handle)
 667{
 668        int rc = 0;
 669
 670        if (intr_handle == NULL)
 671                return -1;
 672
 673        if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
 674                rc = 0;
 675                goto out;
 676        }
 677
 678        if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
 679                rc = -1;
 680                goto out;
 681        }
 682
  683        switch (intr_handle->type) {
 684        /* write to the uio fd to enable the interrupt */
 685        case RTE_INTR_HANDLE_UIO:
 686                if (uio_intr_enable(intr_handle))
 687                        rc = -1;
 688                break;
 689        case RTE_INTR_HANDLE_UIO_INTX:
 690                if (uio_intx_intr_enable(intr_handle))
 691                        rc = -1;
 692                break;
 693        /* not used at this moment */
 694        case RTE_INTR_HANDLE_ALARM:
 695                rc = -1;
 696                break;
 697#ifdef VFIO_PRESENT
 698        case RTE_INTR_HANDLE_VFIO_MSIX:
 699                if (vfio_enable_msix(intr_handle))
 700                        rc = -1;
 701                break;
 702        case RTE_INTR_HANDLE_VFIO_MSI:
 703                if (vfio_enable_msi(intr_handle))
 704                        rc = -1;
 705                break;
 706        case RTE_INTR_HANDLE_VFIO_LEGACY:
 707                if (vfio_enable_intx(intr_handle))
 708                        rc = -1;
 709                break;
 710#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 711        case RTE_INTR_HANDLE_VFIO_REQ:
 712                if (vfio_enable_req(intr_handle))
 713                        rc = -1;
 714                break;
 715#endif
 716#endif
 717        /* not used at this moment */
 718        case RTE_INTR_HANDLE_DEV_EVENT:
 719                rc = -1;
 720                break;
 721        /* unknown handle type */
 722        default:
 723                RTE_LOG(ERR, EAL,
 724                        "Unknown handle type of fd %d\n",
 725                                        intr_handle->fd);
 726                rc = -1;
 727                break;
 728        }
 729out:
 730        rte_eal_trace_intr_enable(intr_handle, rc);
 731        return rc;
 732}
 733
 734/**
 735 * PMD generally calls this function at the end of its IRQ callback.
 736 * Internally, it unmasks the interrupt if possible.
 737 *
  738 * For INTx, unmasking is required as the interrupt is auto-masked prior
  739 * to invoking the callback.
  740 *
  741 * For MSI/MSI-X, unmasking is typically not needed as the interrupt is
  742 * not auto-masked. In fact, for the interrupt handle types VFIO_MSIX and
  743 * VFIO_MSI, this function is a no-op.
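 *
 * A minimal callback sketch (illustrative only; my_dev and
 * handle_link_change are hypothetical):
 *
 *   static void
 *   my_lsc_cb(void *arg)
 *   {
 *           struct my_dev *dev = arg;
 *           handle_link_change(dev);
 *           rte_intr_ack(&dev->intr_handle);  (re-arms INTx)
 *   }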
 744 */
 745int
 746rte_intr_ack(const struct rte_intr_handle *intr_handle)
 747{
 748        if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
 749                return 0;
 750
 751        if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
 752                return -1;
 753
 754        switch (intr_handle->type) {
 755        /* Both acking and enabling are same for UIO */
 756        case RTE_INTR_HANDLE_UIO:
 757                if (uio_intr_enable(intr_handle))
 758                        return -1;
 759                break;
 760        case RTE_INTR_HANDLE_UIO_INTX:
 761                if (uio_intx_intr_enable(intr_handle))
 762                        return -1;
 763                break;
 764        /* not used at this moment */
 765        case RTE_INTR_HANDLE_ALARM:
 766                return -1;
 767#ifdef VFIO_PRESENT
 768        /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
 769        case RTE_INTR_HANDLE_VFIO_MSIX:
 770        case RTE_INTR_HANDLE_VFIO_MSI:
 771                return 0;
 772        case RTE_INTR_HANDLE_VFIO_LEGACY:
 773                if (vfio_ack_intx(intr_handle))
 774                        return -1;
 775                break;
 776#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 777        case RTE_INTR_HANDLE_VFIO_REQ:
 778                return -1;
 779#endif
 780#endif
 781        /* not used at this moment */
 782        case RTE_INTR_HANDLE_DEV_EVENT:
 783                return -1;
 784        /* unknown handle type */
 785        default:
 786                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
 787                        intr_handle->fd);
 788                return -1;
 789        }
 790
 791        return 0;
 792}
 793
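/**
 * Disable the interrupt for the given handle; the counterpart of
 * rte_intr_enable(), dispatching on the handle type in the same way.
 */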
 794int
 795rte_intr_disable(const struct rte_intr_handle *intr_handle)
 796{
 797        int rc = 0;
 798
 799        if (intr_handle == NULL)
 800                return -1;
 801
 802        if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
 803                rc = 0;
 804                goto out;
 805        }
 806
 807        if (intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) {
 808                rc = -1;
 809                goto out;
 810        }
 811
  812        switch (intr_handle->type) {
 813        /* write to the uio fd to disable the interrupt */
 814        case RTE_INTR_HANDLE_UIO:
 815                if (uio_intr_disable(intr_handle))
 816                        rc = -1;
 817                break;
 818        case RTE_INTR_HANDLE_UIO_INTX:
 819                if (uio_intx_intr_disable(intr_handle))
 820                        rc = -1;
 821                break;
 822        /* not used at this moment */
 823        case RTE_INTR_HANDLE_ALARM:
 824                rc = -1;
 825                break;
 826#ifdef VFIO_PRESENT
 827        case RTE_INTR_HANDLE_VFIO_MSIX:
 828                if (vfio_disable_msix(intr_handle))
 829                        rc = -1;
 830                break;
 831        case RTE_INTR_HANDLE_VFIO_MSI:
 832                if (vfio_disable_msi(intr_handle))
 833                        rc = -1;
 834                break;
 835        case RTE_INTR_HANDLE_VFIO_LEGACY:
 836                if (vfio_disable_intx(intr_handle))
 837                        rc = -1;
 838                break;
 839#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 840        case RTE_INTR_HANDLE_VFIO_REQ:
 841                if (vfio_disable_req(intr_handle))
 842                        rc = -1;
 843                break;
 844#endif
 845#endif
 846        /* not used at this moment */
 847        case RTE_INTR_HANDLE_DEV_EVENT:
 848                rc = -1;
 849                break;
 850        /* unknown handle type */
 851        default:
 852                RTE_LOG(ERR, EAL,
 853                        "Unknown handle type of fd %d\n",
 854                                        intr_handle->fd);
 855                rc = -1;
 856                break;
 857        }
 858out:
 859        rte_eal_trace_intr_disable(intr_handle, rc);
 860        return rc;
 861}
 862
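/**
 * Process the fds reported ready by epoll_wait: drain each source's fd,
 * then invoke its callbacks with the interrupt lock released (each
 * callback is called on a private copy, so callbacks may safely
 * unregister themselves), and finally apply deferred deletions. Returns
 * a negative value when the epoll wait list must be rebuilt.
 */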
 863static int
 864eal_intr_process_interrupts(struct epoll_event *events, int nfds)
 865{
 866        bool call = false;
 867        int n, bytes_read, rv;
 868        struct rte_intr_source *src;
 869        struct rte_intr_callback *cb, *next;
 870        union rte_intr_read_buffer buf;
 871        struct rte_intr_callback active_cb;
 872
 873        for (n = 0; n < nfds; n++) {
 874
  875                /**
  876                 * if the pipe fd is ready to read, return so that
  877                 * the wait list can be rebuilt.
  878                 */
  879                if (events[n].data.fd == intr_pipe.readfd) {
 880                        int r = read(intr_pipe.readfd, buf.charbuf,
 881                                        sizeof(buf.charbuf));
 882                        RTE_SET_USED(r);
 883                        return -1;
 884                }
 885                rte_spinlock_lock(&intr_lock);
 886                TAILQ_FOREACH(src, &intr_sources, next)
 887                        if (src->intr_handle.fd ==
 888                                        events[n].data.fd)
 889                                break;
  890                if (src == NULL) {
 891                        rte_spinlock_unlock(&intr_lock);
 892                        continue;
 893                }
 894
 895                /* mark this interrupt source as active and release the lock. */
 896                src->active = 1;
 897                rte_spinlock_unlock(&intr_lock);
 898
  899                /* set the length to be read for each handle type */
 900                switch (src->intr_handle.type) {
 901                case RTE_INTR_HANDLE_UIO:
 902                case RTE_INTR_HANDLE_UIO_INTX:
 903                        bytes_read = sizeof(buf.uio_intr_count);
 904                        break;
 905                case RTE_INTR_HANDLE_ALARM:
 906                        bytes_read = sizeof(buf.timerfd_num);
 907                        break;
 908#ifdef VFIO_PRESENT
 909                case RTE_INTR_HANDLE_VFIO_MSIX:
 910                case RTE_INTR_HANDLE_VFIO_MSI:
 911                case RTE_INTR_HANDLE_VFIO_LEGACY:
 912                        bytes_read = sizeof(buf.vfio_intr_count);
 913                        break;
 914#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 915                case RTE_INTR_HANDLE_VFIO_REQ:
 916                        bytes_read = 0;
 917                        call = true;
 918                        break;
 919#endif
 920#endif
 921                case RTE_INTR_HANDLE_VDEV:
 922                case RTE_INTR_HANDLE_EXT:
 923                        bytes_read = 0;
 924                        call = true;
 925                        break;
 926                case RTE_INTR_HANDLE_DEV_EVENT:
 927                        bytes_read = 0;
 928                        call = true;
 929                        break;
 930                default:
 931                        bytes_read = 1;
 932                        break;
 933                }
 934
 935                if (bytes_read > 0) {
 936                        /**
 937                         * read out to clear the ready-to-be-read flag
 938                         * for epoll_wait.
 939                         */
 940                        bytes_read = read(events[n].data.fd, &buf, bytes_read);
 941                        if (bytes_read < 0) {
 942                                if (errno == EINTR || errno == EWOULDBLOCK)
 943                                        continue;
 944
 945                                RTE_LOG(ERR, EAL, "Error reading from file "
 946                                        "descriptor %d: %s\n",
 947                                        events[n].data.fd,
 948                                        strerror(errno));
 949                                /*
 950                                 * The device is unplugged or buggy, remove
 951                                 * it as an interrupt source and return to
 952                                 * force the wait list to be rebuilt.
 953                                 */
 954                                rte_spinlock_lock(&intr_lock);
 955                                TAILQ_REMOVE(&intr_sources, src, next);
 956                                rte_spinlock_unlock(&intr_lock);
 957
 958                                for (cb = TAILQ_FIRST(&src->callbacks); cb;
 959                                                        cb = next) {
 960                                        next = TAILQ_NEXT(cb, next);
 961                                        TAILQ_REMOVE(&src->callbacks, cb, next);
 962                                        free(cb);
 963                                }
 964                                free(src);
 965                                return -1;
 966                        } else if (bytes_read == 0)
 967                                RTE_LOG(ERR, EAL, "Read nothing from file "
 968                                        "descriptor %d\n", events[n].data.fd);
 969                        else
 970                                call = true;
 971                }
 972
  973                /* grab the lock again to call callbacks and update status. */
 974                rte_spinlock_lock(&intr_lock);
 975
 976                if (call) {
 977
 978                        /* Finally, call all callbacks. */
 979                        TAILQ_FOREACH(cb, &src->callbacks, next) {
 980
 981                                /* make a copy and unlock. */
 982                                active_cb = *cb;
 983                                rte_spinlock_unlock(&intr_lock);
 984
 985                                /* call the actual callback */
 986                                active_cb.cb_fn(active_cb.cb_arg);
 987
  988                                /* get the lock back. */
 989                                rte_spinlock_lock(&intr_lock);
 990                        }
 991                }
  992                /* we are done with that interrupt source, release it. */
 993                src->active = 0;
 994
 995                rv = 0;
 996
  997                /* check if any callbacks are supposed to be removed */
 998                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
 999                        next = TAILQ_NEXT(cb, next);
1000                        if (cb->pending_delete) {
1001                                TAILQ_REMOVE(&src->callbacks, cb, next);
1002                                if (cb->ucb_fn)
1003                                        cb->ucb_fn(&src->intr_handle, cb->cb_arg);
1004                                free(cb);
1005                                rv++;
1006                        }
1007                }
1008
 1009                /* if no callbacks are left for the source, remove it. */
1010                if (TAILQ_EMPTY(&src->callbacks)) {
1011                        TAILQ_REMOVE(&intr_sources, src, next);
1012                        free(src);
1013                }
1014
 1015                /* notify the pipe fd monitored by epoll_wait to rebuild the wait list */
1016                if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1017                        rte_spinlock_unlock(&intr_lock);
1018                        return -EPIPE;
1019                }
1020
1021                rte_spinlock_unlock(&intr_lock);
1022        }
1023
1024        return 0;
1025}
1026
1027/**
1028 * It handles all the interrupts.
1029 *
1030 * @param pfd
1031 *  epoll file descriptor.
1032 * @param totalfds
 1033 *  The number of file descriptors added to the epoll instance.
1034 *
1035 * @return
1036 *  void
1037 */
1038static void
1039eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1040{
1041        struct epoll_event events[totalfds];
1042        int nfds = 0;
1043
 1044        for (;;) {
1045                nfds = epoll_wait(pfd, events, totalfds,
1046                        EAL_INTR_EPOLL_WAIT_FOREVER);
 1047                /* epoll_wait failed */
1048                if (nfds < 0) {
1049                        if (errno == EINTR)
1050                                continue;
1051                        RTE_LOG(ERR, EAL,
1052                                "epoll_wait returns with fail\n");
1053                        return;
1054                }
 1055                /* epoll_wait timed out; should never happen here */
1056                else if (nfds == 0)
1057                        continue;
1058                /* epoll_wait has at least one fd ready to read */
1059                if (eal_intr_process_interrupts(events, nfds) < 0)
1060                        return;
1061        }
1062}
1063
1064/**
1065 * It builds/rebuilds up the epoll file descriptor with all the
1066 * file descriptors being waited on. Then handles the interrupts.
1067 *
1068 * @param arg
1069 *  pointer. (unused)
1070 *
1071 * @return
 1072 *  never returns
1073 */
1074static __rte_noreturn void *
1075eal_intr_thread_main(__rte_unused void *arg)
1076{
1077        /* host thread, never break out */
1078        for (;;) {
1079                /* build up the epoll fd with all descriptors we are to
1080                 * wait on then pass it to the handle_interrupts function
1081                 */
1082                static struct epoll_event pipe_event = {
1083                        .events = EPOLLIN | EPOLLPRI,
1084                };
1085                struct rte_intr_source *src;
1086                unsigned numfds = 0;
1087
1088                /* create epoll fd */
1089                int pfd = epoll_create(1);
1090                if (pfd < 0)
1091                        rte_panic("Cannot create epoll instance\n");
1092
1093                pipe_event.data.fd = intr_pipe.readfd;
 1094                /**
 1095                 * add the pipe fd to the wait list; this pipe is used
 1096                 * to signal that the wait list must be rebuilt.
 1097                 */
1098                if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1099                                                &pipe_event) < 0) {
 1100                        rte_panic("Error adding fd %d to epoll_ctl: %s\n",
 1101                                        intr_pipe.readfd, strerror(errno));
1102                }
1103                numfds++;
1104
1105                rte_spinlock_lock(&intr_lock);
1106
1107                TAILQ_FOREACH(src, &intr_sources, next) {
1108                        struct epoll_event ev;
1109
1110                        if (src->callbacks.tqh_first == NULL)
1111                                continue; /* skip those with no callbacks */
1112                        memset(&ev, 0, sizeof(ev));
1113                        ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1114                        ev.data.fd = src->intr_handle.fd;
1115
 1116                        /**
 1117                         * add every interrupt source's file
 1118                         * descriptor into the wait list.
 1119                         */
 1120                        if (epoll_ctl(pfd, EPOLL_CTL_ADD,
 1121                                        src->intr_handle.fd, &ev) < 0) {
 1122                                rte_panic("Error adding fd %d to epoll_ctl: %s\n",
 1123                                        src->intr_handle.fd, strerror(errno));
1124                        }
1125                        else
1126                                numfds++;
1127                }
1128                rte_spinlock_unlock(&intr_lock);
1129                /* serve the interrupt */
1130                eal_intr_handle_interrupts(pfd, numfds);
1131
1132                /**
1133                 * when we return, we need to rebuild the
1134                 * list of fds to monitor.
1135                 */
1136                close(pfd);
1137        }
1138}
1139
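/**
 * Initialize the interrupt subsystem: set up the source list and the
 * wake-up pipe, then spawn the "eal-intr-thread" control thread.
 */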
1140int
1141rte_eal_intr_init(void)
1142{
1143        int ret = 0;
1144
1145        /* init the global interrupt source head */
1146        TAILQ_INIT(&intr_sources);
1147
 1148        /**
 1149         * create a pipe that epoll waits on; writing to it notifies
 1150         * the interrupt thread to rebuild the epoll wait list.
 1151         */
1152        if (pipe(intr_pipe.pipefd) < 0) {
1153                rte_errno = errno;
1154                return -1;
1155        }
1156
1157        /* create the host thread to wait/handle the interrupt */
1158        ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1159                        eal_intr_thread_main, NULL);
1160        if (ret != 0) {
1161                rte_errno = -ret;
1162                RTE_LOG(ERR, EAL,
1163                        "Failed to create thread for interrupt handling\n");
1164        }
1165
1166        return ret;
1167}
1168
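/**
 * Drain the eventfd behind an Rx interrupt vector so that the
 * edge-triggered epoll entry can fire again; how many bytes to read
 * depends on the handle type.
 */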
1169static void
1170eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1171{
1172        union rte_intr_read_buffer buf;
1173        int bytes_read = 0;
1174        int nbytes;
1175
1176        switch (intr_handle->type) {
1177        case RTE_INTR_HANDLE_UIO:
1178        case RTE_INTR_HANDLE_UIO_INTX:
1179                bytes_read = sizeof(buf.uio_intr_count);
1180                break;
1181#ifdef VFIO_PRESENT
1182        case RTE_INTR_HANDLE_VFIO_MSIX:
1183        case RTE_INTR_HANDLE_VFIO_MSI:
1184        case RTE_INTR_HANDLE_VFIO_LEGACY:
1185                bytes_read = sizeof(buf.vfio_intr_count);
1186                break;
1187#endif
1188        case RTE_INTR_HANDLE_VDEV:
 1189                /* for vdev, the number of bytes to read is set by the driver */
 1190                bytes_read = intr_handle->efd_counter_size;
1191                break;
1192        case RTE_INTR_HANDLE_EXT:
1193                return;
1194        default:
1195                bytes_read = 1;
1196                RTE_LOG(INFO, EAL, "unexpected intr type\n");
1197                break;
1198        }
1199
1200        /**
1201         * read out to clear the ready-to-be-read flag
1202         * for epoll_wait.
1203         */
1204        if (bytes_read == 0)
1205                return;
1206        do {
1207                nbytes = read(fd, &buf, bytes_read);
1208                if (nbytes < 0) {
1209                        if (errno == EINTR || errno == EWOULDBLOCK ||
1210                            errno == EAGAIN)
1211                                continue;
1212                        RTE_LOG(ERR, EAL,
1213                                "Error reading from fd %d: %s\n",
1214                                fd, strerror(errno));
1215                } else if (nbytes == 0)
1216                        RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1217                return;
1218        } while (1);
1219}
1220
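/**
 * Copy the ready epoll events into the caller's rte_epoll_event array.
 * The CAS from VALID to EXEC gives this thread exclusive access to the
 * event data against a concurrent rte_epoll_ctl(EPOLL_CTL_DEL) teardown.
 */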
1221static int
1222eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1223                        struct rte_epoll_event *events)
1224{
1225        unsigned int i, count = 0;
1226        struct rte_epoll_event *rev;
1227        uint32_t valid_status;
1228
1229        for (i = 0; i < n; i++) {
1230                rev = evs[i].data.ptr;
 1231                valid_status = RTE_EPOLL_VALID;
1232                /* ACQUIRE memory ordering here pairs with RELEASE
1233                 * ordering below acting as a lock to synchronize
1234                 * the event data updating.
1235                 */
1236                if (!rev || !__atomic_compare_exchange_n(&rev->status,
1237                                    &valid_status, RTE_EPOLL_EXEC, 0,
1238                                    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1239                        continue;
1240
1241                events[count].status        = RTE_EPOLL_VALID;
1242                events[count].fd            = rev->fd;
1243                events[count].epfd          = rev->epfd;
1244                events[count].epdata.event  = evs[i].events;
1245                events[count].epdata.data   = rev->epdata.data;
1246                if (rev->epdata.cb_fun)
1247                        rev->epdata.cb_fun(rev->fd,
1248                                           rev->epdata.cb_arg);
1249
1250                /* the status update should be observed after
1251                 * the other fields change.
1252                 */
1253                __atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1254                                __ATOMIC_RELEASE);
1255                count++;
1256        }
1257        return count;
1258}
1259
1260static inline int
1261eal_init_tls_epfd(void)
1262{
1263        int pfd = epoll_create(255);
1264
1265        if (pfd < 0) {
1266                RTE_LOG(ERR, EAL,
1267                        "Cannot create epoll instance\n");
1268                return -1;
1269        }
1270        return pfd;
1271}
1272
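/**
 * Return the calling thread's private epoll fd, creating it on first
 * use.
 */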
1273int
1274rte_intr_tls_epfd(void)
1275{
1276        if (RTE_PER_LCORE(_epfd) == -1)
1277                RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1278
1279        return RTE_PER_LCORE(_epfd);
1280}
1281
1282static int
1283eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1284               int maxevents, int timeout, bool interruptible)
1285{
1286        struct epoll_event evs[maxevents];
1287        int rc;
1288
1289        if (!events) {
1290                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1291                return -1;
1292        }
1293
1294        /* using per thread epoll fd */
1295        if (epfd == RTE_EPOLL_PER_THREAD)
1296                epfd = rte_intr_tls_epfd();
1297
1298        while (1) {
1299                rc = epoll_wait(epfd, evs, maxevents, timeout);
1300                if (likely(rc > 0)) {
1301                        /* epoll_wait has at least one fd ready to read */
1302                        rc = eal_epoll_process_event(evs, rc, events);
1303                        break;
1304                } else if (rc < 0) {
1305                        if (errno == EINTR) {
1306                                if (interruptible)
1307                                        return -1;
1308                                else
1309                                        continue;
1310                        }
 1311                        /* epoll_wait failed */
1312                        RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
1313                                strerror(errno));
1314                        rc = -1;
1315                        break;
1316                } else {
1317                        /* rc == 0, epoll_wait timed out */
1318                        break;
1319                }
1320        }
1321
1322        return rc;
1323}
1324
1325int
1326rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1327               int maxevents, int timeout)
1328{
1329        return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1330}
1331
1332int
1333rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1334                             int maxevents, int timeout)
1335{
1336        return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1337}
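
/*
 * Typical use of the rte_epoll API, as a sketch (not code from this
 * file; fd and timeout_ms are assumed to exist in the caller):
 *
 *   static struct rte_epoll_event ev = {
 *           .epdata.event = EPOLLIN | EPOLLET,
 *   };
 *   struct rte_epoll_event out[8];
 *
 *   rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, fd, &ev);
 *   int n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, out, 8, timeout_ms);
 */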
1338
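/*
 * Spin until no other thread is executing the event (status back to
 * VALID), then atomically invalidate it so its data can be reset.
 */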
1339static inline void
1340eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1341{
1342        uint32_t valid_status = RTE_EPOLL_VALID;
1343
1344        while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1345                    RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1346                while (__atomic_load_n(&ev->status,
1347                                __ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1348                        rte_pause();
1349                valid_status = RTE_EPOLL_VALID;
1350        }
1351        memset(&ev->epdata, 0, sizeof(ev->epdata));
1352        ev->fd = -1;
1353        ev->epfd = -1;
1354}
1355
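/**
 * Wrapper around epoll_ctl() for rte_epoll_event objects: ADD publishes
 * the event (status VALID) before arming it in the kernel; DEL waits
 * until no thread is executing the event before clearing its data.
 */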
1356int
1357rte_epoll_ctl(int epfd, int op, int fd,
1358              struct rte_epoll_event *event)
1359{
1360        struct epoll_event ev;
1361
1362        if (!event) {
1363                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1364                return -1;
1365        }
1366
1367        /* using per thread epoll fd */
1368        if (epfd == RTE_EPOLL_PER_THREAD)
1369                epfd = rte_intr_tls_epfd();
1370
1371        if (op == EPOLL_CTL_ADD) {
1372                __atomic_store_n(&event->status, RTE_EPOLL_VALID,
1373                                __ATOMIC_RELAXED);
 1374                event->fd = fd;  /* the fd argument overrides event->fd */
1375                event->epfd = epfd;
1376                ev.data.ptr = (void *)event;
1377        }
1378
1379        ev.events = event->epdata.event;
1380        if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1381                RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1382                        op, fd, strerror(errno));
1383                if (op == EPOLL_CTL_ADD)
 1384                        /* roll back the status when CTL_ADD fails */
1385                        __atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1386                                        __ATOMIC_RELAXED);
1387                return -1;
1388        }
1389
1390        if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1391                        __ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1392                eal_epoll_data_safe_free(event);
1393
1394        return 0;
1395}
1396
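/**
 * Add or delete the epoll event for Rx interrupt vector @vec of the
 * device. On ADD, the per-vector event is armed edge-triggered with
 * eal_intr_proc_rxtx_intr() as the callback that drains the eventfd.
 */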
1397int
1398rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1399                int op, unsigned int vec, void *data)
1400{
1401        struct rte_epoll_event *rev;
1402        struct rte_epoll_data *epdata;
1403        int epfd_op;
1404        unsigned int efd_idx;
1405        int rc = 0;
1406
1407        efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1408                (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1409
1410        if (!intr_handle || intr_handle->nb_efd == 0 ||
1411            efd_idx >= intr_handle->nb_efd) {
1412                RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1413                return -EPERM;
1414        }
1415
1416        switch (op) {
1417        case RTE_INTR_EVENT_ADD:
1418                epfd_op = EPOLL_CTL_ADD;
1419                rev = &intr_handle->elist[efd_idx];
1420                if (__atomic_load_n(&rev->status,
1421                                __ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1422                        RTE_LOG(INFO, EAL, "Event already been added.\n");
1423                        return -EEXIST;
1424                }
1425
1426                /* attach to intr vector fd */
1427                epdata = &rev->epdata;
1428                epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1429                epdata->data   = data;
1430                epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1431                epdata->cb_arg = (void *)intr_handle;
1432                rc = rte_epoll_ctl(epfd, epfd_op,
1433                                   intr_handle->efds[efd_idx], rev);
1434                if (!rc)
1435                        RTE_LOG(DEBUG, EAL,
1436                                "efd %d associated with vec %d added on epfd %d"
1437                                "\n", rev->fd, vec, epfd);
1438                else
1439                        rc = -EPERM;
1440                break;
1441        case RTE_INTR_EVENT_DEL:
1442                epfd_op = EPOLL_CTL_DEL;
1443                rev = &intr_handle->elist[efd_idx];
1444                if (__atomic_load_n(&rev->status,
1445                                __ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1446                        RTE_LOG(INFO, EAL, "Event does not exist.\n");
1447                        return -EPERM;
1448                }
1449
1450                rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1451                if (rc)
1452                        rc = -EPERM;
1453                break;
1454        default:
1455                RTE_LOG(ERR, EAL, "event op type mismatch\n");
1456                rc = -EPERM;
1457        }
1458
1459        return rc;
1460}
1461
1462void
1463rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1464{
1465        uint32_t i;
1466        struct rte_epoll_event *rev;
1467
1468        for (i = 0; i < intr_handle->nb_efd; i++) {
1469                rev = &intr_handle->elist[i];
1470                if (__atomic_load_n(&rev->status,
1471                                __ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1472                        continue;
1473                if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
 1474                        /* force-free if the entry is still valid */
1475                        eal_epoll_data_safe_free(rev);
1476                }
1477        }
1478}
1479
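/**
 * Set up datapath interrupt fds: for VFIO MSI-X, create up to
 * RTE_MAX_RXTX_INTR_VEC_ID per-queue eventfds; for VDEV, only validate
 * efd_counter_size (the driver creates the fds); otherwise fall back to
 * the single interrupt fd.
 */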
1480int
1481rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1482{
1483        uint32_t i;
1484        int fd;
1485        uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1486
1487        assert(nb_efd != 0);
1488
1489        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
1490                for (i = 0; i < n; i++) {
1491                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1492                        if (fd < 0) {
1493                                RTE_LOG(ERR, EAL,
1494                                        "can't setup eventfd, error %i (%s)\n",
1495                                        errno, strerror(errno));
1496                                return -errno;
1497                        }
1498                        intr_handle->efds[i] = fd;
1499                }
1500                intr_handle->nb_efd   = n;
1501                intr_handle->max_intr = NB_OTHER_INTR + n;
1502        } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
 1503                /* only check; initialization is done in the vdev driver. */
1504                if (intr_handle->efd_counter_size >
1505                    sizeof(union rte_intr_read_buffer)) {
1506                        RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
1507                        return -EINVAL;
1508                }
1509        } else {
1510                intr_handle->efds[0]  = intr_handle->fd;
1511                intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
1512                intr_handle->max_intr = NB_OTHER_INTR;
1513        }
1514
1515        return 0;
1516}
1517
1518void
1519rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1520{
1521        uint32_t i;
1522
1523        rte_intr_free_epoll_fd(intr_handle);
1524        if (intr_handle->max_intr > intr_handle->nb_efd) {
1525                for (i = 0; i < intr_handle->nb_efd; i++)
1526                        close(intr_handle->efds[i]);
1527        }
1528        intr_handle->nb_efd = 0;
1529        intr_handle->max_intr = 0;
1530}
1531
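/**
 * Report whether datapath (Rx queue) interrupts are enabled, i.e.
 * whether at least one efd has been set up.
 */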
1532int
1533rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1534{
 1535        return !!intr_handle->nb_efd;
1536}
1537
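/**
 * Report whether a vector remains available for non-queue interrupts
 * (such as LSC) besides the datapath efds.
 */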
1538int
1539rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1540{
1541        if (!rte_intr_dp_is_en(intr_handle))
1542                return 1;
1543        else
1544                return !!(intr_handle->max_intr - intr_handle->nb_efd);
1545}
1546
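/**
 * Report whether the handle supports multiple interrupt vectors:
 * VFIO MSI-X, or VDEV where the driver manages the efds itself.
 */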
1547int
1548rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1549{
1550        if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
1551                return 1;
1552
1553        if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
1554                return 1;
1555
1556        return 0;
1557}
1558
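/**
 * Return nonzero when the calling thread is the EAL interrupt thread.
 */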
1559int rte_thread_is_intr(void)
1560{
1561        return pthread_equal(intr_thread, pthread_self());
1562}
1563