dpdk/lib/eal/linux/eal_interrupts.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2010-2014 Intel Corporation
   3 */
   4
   5#include <stdio.h>
   6#include <stdint.h>
   7#include <stdlib.h>
   8#include <pthread.h>
   9#include <sys/queue.h>
  10#include <unistd.h>
  11#include <string.h>
  12#include <errno.h>
  13#include <sys/epoll.h>
  14#include <sys/ioctl.h>
  15#include <sys/eventfd.h>
  16#include <assert.h>
  17#include <stdbool.h>
  18
  19#include <rte_common.h>
  20#include <rte_interrupts.h>
  21#include <rte_per_lcore.h>
  22#include <rte_lcore.h>
  23#include <rte_branch_prediction.h>
  24#include <rte_debug.h>
  25#include <rte_log.h>
  26#include <rte_errno.h>
  27#include <rte_spinlock.h>
  28#include <rte_pause.h>
  29#include <rte_vfio.h>
  30#include <rte_eal_trace.h>
  31
  32#include "eal_private.h"
  33
  34#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
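    /* one interrupt vector is reserved for the device's non-queue ("other")
     * interrupt, e.g. link status change; see rte_intr_efd_enable() below
     */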
  35#define NB_OTHER_INTR               1
  36
  37static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
  38
  39/**
   40 * Union for the pipe fds: pipefd[] as filled by pipe() aliases readfd/writefd.
  41 */
  42union intr_pipefds{
  43        struct {
  44                int pipefd[2];
  45        };
  46        struct {
  47                int readfd;
  48                int writefd;
  49        };
  50};
  51
  52/**
  53 * union buffer for reading on different devices
  54 */
  55union rte_intr_read_buffer {
  56        int uio_intr_count;              /* for uio device */
  57#ifdef VFIO_PRESENT
  58        uint64_t vfio_intr_count;        /* for vfio device */
  59#endif
  60        uint64_t timerfd_num;            /* for timerfd */
  61        char charbuf[16];                /* for others */
  62};
  63
  64TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
  65TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
  66
  67struct rte_intr_callback {
  68        TAILQ_ENTRY(rte_intr_callback) next;
  69        rte_intr_callback_fn cb_fn;  /**< callback address */
  70        void *cb_arg;                /**< parameter for callback */
  71        uint8_t pending_delete;      /**< delete after callback is called */
  72        rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
  73};
  74
  75struct rte_intr_source {
  76        TAILQ_ENTRY(rte_intr_source) next;
  77        struct rte_intr_handle *intr_handle; /**< interrupt handle */
  78        struct rte_intr_cb_list callbacks;  /**< user callbacks */
  79        uint32_t active;
  80};
  81
  82/* global spinlock for interrupt data operation */
  83static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
  84
   85/* pipe used to notify the interrupt thread to rebuild its wait list */
  86static union intr_pipefds intr_pipe;
  87
  88/* interrupt sources list */
  89static struct rte_intr_source_list intr_sources;
  90
  91/* interrupt handling thread */
  92static pthread_t intr_thread;
  93
  94/* VFIO interrupts */
  95#ifdef VFIO_PRESENT
  96
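    /* irq set buffer length for a single eventfd (INTx, MSI, req notifier) */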
  97#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
  98/* irq set buffer length for queue interrupts and LSC interrupt */
  99#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
 100                              sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
 101
 102/* enable legacy (INTx) interrupts */
 103static int
 104vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
 105        struct vfio_irq_set *irq_set;
 106        char irq_set_buf[IRQ_SET_BUF_LEN];
 107        int len, ret, vfio_dev_fd;
 108        int *fd_ptr;
 109
 110        len = sizeof(irq_set_buf);
 111
 112        /* enable INTx */
 113        irq_set = (struct vfio_irq_set *) irq_set_buf;
 114        irq_set->argsz = len;
 115        irq_set->count = 1;
 116        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 117        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 118        irq_set->start = 0;
 119        fd_ptr = (int *) &irq_set->data;
 120        *fd_ptr = rte_intr_fd_get(intr_handle);
 121
 122        vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
 123        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 124
 125        if (ret) {
 126                RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
 127                        rte_intr_fd_get(intr_handle));
 128                return -1;
 129        }
 130
 131        /* unmask INTx after enabling */
 132        memset(irq_set, 0, len);
 133        len = sizeof(struct vfio_irq_set);
 134        irq_set->argsz = len;
 135        irq_set->count = 1;
 136        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
 137        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 138        irq_set->start = 0;
 139
 140        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 141
 142        if (ret) {
 143                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
 144                        rte_intr_fd_get(intr_handle));
 145                return -1;
 146        }
 147        return 0;
 148}
 149
 150/* disable legacy (INTx) interrupts */
 151static int
 152vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
 153        struct vfio_irq_set *irq_set;
 154        char irq_set_buf[IRQ_SET_BUF_LEN];
 155        int len, ret, vfio_dev_fd;
 156
 157        len = sizeof(struct vfio_irq_set);
 158
 159        /* mask interrupts before disabling */
 160        irq_set = (struct vfio_irq_set *) irq_set_buf;
 161        irq_set->argsz = len;
 162        irq_set->count = 1;
 163        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
 164        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 165        irq_set->start = 0;
 166
 167        vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
 168        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 169
 170        if (ret) {
 171                RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
 172                        rte_intr_fd_get(intr_handle));
 173                return -1;
 174        }
 175
  176        /* disable INTx */
 177        memset(irq_set, 0, len);
 178        irq_set->argsz = len;
 179        irq_set->count = 0;
 180        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
 181        irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
 182        irq_set->start = 0;
 183
 184        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 185
 186        if (ret) {
 187                RTE_LOG(ERR, EAL, "Error disabling INTx interrupts for fd %d\n",
 188                        rte_intr_fd_get(intr_handle));
 189                return -1;
 190        }
 191        return 0;
 192}
 193
 194/* unmask/ack legacy (INTx) interrupts */
 195static int
 196vfio_ack_intx(const struct rte_intr_handle *intr_handle)
 197{
 198        struct vfio_irq_set irq_set;
 199        int vfio_dev_fd;
 200
 201        /* unmask INTx */
 202        memset(&irq_set, 0, sizeof(irq_set));
 203        irq_set.argsz = sizeof(irq_set);
 204        irq_set.count = 1;
 205        irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
 206        irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
 207        irq_set.start = 0;
 208
 209        vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
 210        if (ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
 211                RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
 212                        rte_intr_fd_get(intr_handle));
 213                return -1;
 214        }
 215        return 0;
 216}
 217
 218/* enable MSI interrupts */
 219static int
 220vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
 221        int len, ret;
 222        char irq_set_buf[IRQ_SET_BUF_LEN];
 223        struct vfio_irq_set *irq_set;
 224        int *fd_ptr, vfio_dev_fd;
 225
 226        len = sizeof(irq_set_buf);
 227
 228        irq_set = (struct vfio_irq_set *) irq_set_buf;
 229        irq_set->argsz = len;
 230        irq_set->count = 1;
 231        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 232        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
 233        irq_set->start = 0;
 234        fd_ptr = (int *) &irq_set->data;
 235        *fd_ptr = rte_intr_fd_get(intr_handle);
 236
 237        vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
 238        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 239
 240        if (ret) {
 241                RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
 242                        rte_intr_fd_get(intr_handle));
 243                return -1;
 244        }
 245        return 0;
 246}
 247
 248/* disable MSI interrupts */
 249static int
 250vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
 251        struct vfio_irq_set *irq_set;
 252        char irq_set_buf[IRQ_SET_BUF_LEN];
 253        int len, ret, vfio_dev_fd;
 254
 255        len = sizeof(struct vfio_irq_set);
 256
 257        irq_set = (struct vfio_irq_set *) irq_set_buf;
 258        irq_set->argsz = len;
 259        irq_set->count = 0;
 260        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
 261        irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
 262        irq_set->start = 0;
 263
 264        vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
 265        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 266        if (ret)
 267                RTE_LOG(ERR, EAL, "Error disabling MSI interrupts for fd %d\n",
 268                        rte_intr_fd_get(intr_handle));
 269
 270        return ret;
 271}
 272
 273/* enable MSI-X interrupts */
 274static int
 275vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
 276        int len, ret;
 277        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 278        struct vfio_irq_set *irq_set;
 279        int *fd_ptr, vfio_dev_fd, i;
 280
 281        len = sizeof(irq_set_buf);
 282
 283        irq_set = (struct vfio_irq_set *) irq_set_buf;
 284        irq_set->argsz = len;
  285        /* 0 < irq_set->count <= RTE_MAX_RXTX_INTR_VEC_ID + 1 */
 286        irq_set->count = rte_intr_max_intr_get(intr_handle) ?
 287                (rte_intr_max_intr_get(intr_handle) >
 288                 RTE_MAX_RXTX_INTR_VEC_ID + 1 ? RTE_MAX_RXTX_INTR_VEC_ID + 1 :
 289                 rte_intr_max_intr_get(intr_handle)) : 1;
 290
 291        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 292        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 293        irq_set->start = 0;
 294        fd_ptr = (int *) &irq_set->data;
  295        /* INTR vector offset 0 is reserved for the non-efd mapping */
 296        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(intr_handle);
 297        for (i = 0; i < rte_intr_nb_efd_get(intr_handle); i++) {
 298                fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] =
 299                        rte_intr_efds_index_get(intr_handle, i);
 300        }
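            /* vector 0 therefore carries the device's own fd (misc/LSC
             * interrupt), while vectors from RTE_INTR_VEC_RXTX_OFFSET on
             * carry the per-queue eventfds created by rte_intr_efd_enable()
             */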
 301
 302        vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
 303        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 304
 305        if (ret) {
 306                RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
 307                        rte_intr_fd_get(intr_handle));
 308                return -1;
 309        }
 310
 311        return 0;
 312}
 313
 314/* disable MSI-X interrupts */
 315static int
 316vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
 317        struct vfio_irq_set *irq_set;
 318        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
 319        int len, ret, vfio_dev_fd;
 320
 321        len = sizeof(struct vfio_irq_set);
 322
 323        irq_set = (struct vfio_irq_set *) irq_set_buf;
 324        irq_set->argsz = len;
 325        irq_set->count = 0;
 326        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
 327        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 328        irq_set->start = 0;
 329
 330        vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
 331        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 332
 333        if (ret)
 334                RTE_LOG(ERR, EAL, "Error disabling MSI-X interrupts for fd %d\n",
 335                        rte_intr_fd_get(intr_handle));
 336
 337        return ret;
 338}
 339
 340#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 341/* enable req notifier */
 342static int
 343vfio_enable_req(const struct rte_intr_handle *intr_handle)
 344{
 345        int len, ret;
 346        char irq_set_buf[IRQ_SET_BUF_LEN];
 347        struct vfio_irq_set *irq_set;
 348        int *fd_ptr, vfio_dev_fd;
 349
 350        len = sizeof(irq_set_buf);
 351
 352        irq_set = (struct vfio_irq_set *) irq_set_buf;
 353        irq_set->argsz = len;
 354        irq_set->count = 1;
 355        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
 356                         VFIO_IRQ_SET_ACTION_TRIGGER;
 357        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
 358        irq_set->start = 0;
 359        fd_ptr = (int *) &irq_set->data;
 360        *fd_ptr = rte_intr_fd_get(intr_handle);
 361
 362        vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
 363        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 364
 365        if (ret) {
 366                RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
 367                        rte_intr_fd_get(intr_handle));
 368                return -1;
 369        }
 370
 371        return 0;
 372}
 373
 374/* disable req notifier */
 375static int
 376vfio_disable_req(const struct rte_intr_handle *intr_handle)
 377{
 378        struct vfio_irq_set *irq_set;
 379        char irq_set_buf[IRQ_SET_BUF_LEN];
 380        int len, ret, vfio_dev_fd;
 381
 382        len = sizeof(struct vfio_irq_set);
 383
 384        irq_set = (struct vfio_irq_set *) irq_set_buf;
 385        irq_set->argsz = len;
 386        irq_set->count = 0;
 387        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
 388        irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
 389        irq_set->start = 0;
 390
 391        vfio_dev_fd = rte_intr_dev_fd_get(intr_handle);
 392        ret = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 393
 394        if (ret)
 395                RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
 396                        rte_intr_fd_get(intr_handle));
 397
 398        return ret;
 399}
 400#endif
 401#endif
 402
 403static int
 404uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
 405{
 406        unsigned char command_high;
 407        int uio_cfg_fd;
 408
 409        /* use UIO config file descriptor for uio_pci_generic */
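            /* offset 5 is the high byte of the 16-bit PCI command register
             * (config space offset 0x04); bit 2 of that byte is the command
             * register's "Interrupt Disable" bit (bit 10), which masks INTx
             */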
 410        uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
 411        if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
 412                RTE_LOG(ERR, EAL,
 413                        "Error reading interrupts status for fd %d\n",
 414                        uio_cfg_fd);
 415                return -1;
 416        }
 417        /* disable interrupts */
 418        command_high |= 0x4;
 419        if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
 420                RTE_LOG(ERR, EAL,
 421                        "Error disabling interrupts for fd %d\n",
 422                        uio_cfg_fd);
 423                return -1;
 424        }
 425
 426        return 0;
 427}
 428
 429static int
 430uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
 431{
 432        unsigned char command_high;
 433        int uio_cfg_fd;
 434
 435        /* use UIO config file descriptor for uio_pci_generic */
 436        uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
 437        if (uio_cfg_fd < 0 || pread(uio_cfg_fd, &command_high, 1, 5) != 1) {
 438                RTE_LOG(ERR, EAL,
 439                        "Error reading interrupts status for fd %d\n",
 440                        uio_cfg_fd);
 441                return -1;
 442        }
 443        /* enable interrupts */
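            /* clear the command register's "Interrupt Disable" bit
             * (see uio_intx_intr_disable() above) so INTx can be asserted again
             */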
 444        command_high &= ~0x4;
 445        if (pwrite(uio_cfg_fd, &command_high, 1, 5) != 1) {
 446                RTE_LOG(ERR, EAL,
 447                        "Error enabling interrupts for fd %d\n",
 448                        uio_cfg_fd);
 449                return -1;
 450        }
 451
 452        return 0;
 453}
 454
 455static int
 456uio_intr_disable(const struct rte_intr_handle *intr_handle)
 457{
 458        const int value = 0;
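            /* a 4-byte write to the /dev/uioX fd is handed to the kernel UIO
             * driver's irqcontrol() hook (e.g. igb_uio): 0 masks the interrupt
             */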
 459
 460        if (rte_intr_fd_get(intr_handle) < 0 ||
 461            write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
 462                RTE_LOG(ERR, EAL, "Error disabling interrupts for fd %d (%s)\n",
 463                        rte_intr_fd_get(intr_handle), strerror(errno));
 464                return -1;
 465        }
 466        return 0;
 467}
 468
 469static int
 470uio_intr_enable(const struct rte_intr_handle *intr_handle)
 471{
 472        const int value = 1;
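            /* writing 1 asks the UIO driver's irqcontrol() hook to unmask
             * the interrupt again
             */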
 473
 474        if (rte_intr_fd_get(intr_handle) < 0 ||
 475            write(rte_intr_fd_get(intr_handle), &value, sizeof(value)) < 0) {
 476                RTE_LOG(ERR, EAL, "Error enabling interrupts for fd %d (%s)\n",
 477                        rte_intr_fd_get(intr_handle), strerror(errno));
 478                return -1;
 479        }
 480        return 0;
 481}
 482
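    /*
     * Illustrative use from a driver (hypothetical names, not part of this
     * file): register the handler once the device fd is ready,
     *
     *     rte_intr_callback_register(dev->intr_handle,
     *                                my_dev_interrupt_handler, dev);
     *
     * and remove it again on close, from a non-interrupt thread:
     *
     *     rte_intr_callback_unregister_sync(dev->intr_handle,
     *                                       my_dev_interrupt_handler, dev);
     */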
 483int
 484rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
 485                        rte_intr_callback_fn cb, void *cb_arg)
 486{
 487        int ret, wake_thread;
 488        struct rte_intr_source *src;
 489        struct rte_intr_callback *callback;
 490
 491        wake_thread = 0;
 492
 493        /* first do parameter checking */
 494        if (rte_intr_fd_get(intr_handle) < 0 || cb == NULL) {
 495                RTE_LOG(ERR, EAL, "Registering with invalid input parameter\n");
 496                return -EINVAL;
 497        }
 498
 499        /* allocate a new interrupt callback entity */
 500        callback = calloc(1, sizeof(*callback));
 501        if (callback == NULL) {
 502                RTE_LOG(ERR, EAL, "Can not allocate memory\n");
 503                return -ENOMEM;
 504        }
 505        callback->cb_fn = cb;
 506        callback->cb_arg = cb_arg;
 507        callback->pending_delete = 0;
 508        callback->ucb_fn = NULL;
 509
 510        rte_spinlock_lock(&intr_lock);
 511
 512        /* check if there is at least one callback registered for the fd */
 513        TAILQ_FOREACH(src, &intr_sources, next) {
 514                if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle)) {
 515                        /* we had no interrupts for this */
 516                        if (TAILQ_EMPTY(&src->callbacks))
 517                                wake_thread = 1;
 518
 519                        TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
 520                        ret = 0;
 521                        break;
 522                }
 523        }
 524
 525        /* no existing callbacks for this - add new source */
 526        if (src == NULL) {
 527                src = calloc(1, sizeof(*src));
 528                if (src == NULL) {
 529                        RTE_LOG(ERR, EAL, "Can not allocate memory\n");
 530                        ret = -ENOMEM;
 531                        free(callback);
 532                        callback = NULL;
 533                } else {
 534                        src->intr_handle = rte_intr_instance_dup(intr_handle);
 535                        if (src->intr_handle == NULL) {
 536                                RTE_LOG(ERR, EAL, "Can not create intr instance\n");
 537                                ret = -ENOMEM;
 538                                free(callback);
 539                                callback = NULL;
 540                                free(src);
 541                                src = NULL;
 542                        } else {
 543                                TAILQ_INIT(&src->callbacks);
 544                                TAILQ_INSERT_TAIL(&(src->callbacks), callback,
 545                                                  next);
 546                                TAILQ_INSERT_TAIL(&intr_sources, src, next);
 547                                wake_thread = 1;
 548                                ret = 0;
 549                        }
 550                }
 551        }
 552
 553        rte_spinlock_unlock(&intr_lock);
 554
 555        /**
  556         * check if we need to notify the interrupt thread (via the pipe fd
  557         * it waits on in epoll_wait) to rebuild the wait list.
 558         */
 559        if (wake_thread)
 560                if (write(intr_pipe.writefd, "1", 1) < 0)
 561                        ret = -EPIPE;
 562
 563        rte_eal_trace_intr_callback_register(intr_handle, cb, cb_arg, ret);
 564        return ret;
 565}
 566
 567int
 568rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
 569                                rte_intr_callback_fn cb_fn, void *cb_arg,
 570                                rte_intr_unregister_callback_fn ucb_fn)
 571{
 572        int ret;
 573        struct rte_intr_source *src;
 574        struct rte_intr_callback *cb, *next;
 575
 576        /* do parameter checking first */
 577        if (rte_intr_fd_get(intr_handle) < 0) {
 578                RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
 579                return -EINVAL;
 580        }
 581
 582        rte_spinlock_lock(&intr_lock);
 583
  584        /* check if an interrupt source exists for the fd */
 585        TAILQ_FOREACH(src, &intr_sources, next) {
 586                if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
 587                        break;
 588        }
 589
 590        /* No interrupt source registered for the fd */
 591        if (src == NULL) {
 592                ret = -ENOENT;
 593
 594        /* only usable if the source is active */
 595        } else if (src->active == 0) {
 596                ret = -EAGAIN;
 597
 598        } else {
 599                ret = 0;
 600
 601                /* walk through the callbacks and mark all that match. */
 602                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
 603                        next = TAILQ_NEXT(cb, next);
 604                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
 605                                        cb->cb_arg == cb_arg)) {
 606                                cb->pending_delete = 1;
 607                                cb->ucb_fn = ucb_fn;
 608                                ret++;
 609                        }
 610                }
 611        }
 612
 613        rte_spinlock_unlock(&intr_lock);
 614
 615        return ret;
 616}
 617
 618int
 619rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
 620                        rte_intr_callback_fn cb_fn, void *cb_arg)
 621{
 622        int ret;
 623        struct rte_intr_source *src;
 624        struct rte_intr_callback *cb, *next;
 625
 626        /* do parameter checking first */
 627        if (rte_intr_fd_get(intr_handle) < 0) {
 628                RTE_LOG(ERR, EAL, "Unregistering with invalid input parameter\n");
 629                return -EINVAL;
 630        }
 631
 632        rte_spinlock_lock(&intr_lock);
 633
  634        /* check if an interrupt source exists for the fd */
 635        TAILQ_FOREACH(src, &intr_sources, next)
 636                if (rte_intr_fd_get(src->intr_handle) == rte_intr_fd_get(intr_handle))
 637                        break;
 638
 639        /* No interrupt source registered for the fd */
 640        if (src == NULL) {
 641                ret = -ENOENT;
 642
 643        /* interrupt source has some active callbacks right now. */
 644        } else if (src->active != 0) {
 645                ret = -EAGAIN;
 646
 647        /* ok to remove. */
 648        } else {
 649                ret = 0;
 650
  651                /* walk through the callbacks and remove all that match. */
 652                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
 653
 654                        next = TAILQ_NEXT(cb, next);
 655
 656                        if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
 657                                        cb->cb_arg == cb_arg)) {
 658                                TAILQ_REMOVE(&src->callbacks, cb, next);
 659                                free(cb);
 660                                ret++;
 661                        }
 662                }
 663
 664                /* all callbacks for that source are removed. */
 665                if (TAILQ_EMPTY(&src->callbacks)) {
 666                        TAILQ_REMOVE(&intr_sources, src, next);
 667                        rte_intr_instance_free(src->intr_handle);
 668                        free(src);
 669                }
 670        }
 671
 672        rte_spinlock_unlock(&intr_lock);
 673
  674        /* notify the pipe fd that epoll_wait waits on so the wait list is rebuilt */
 675        if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
 676                ret = -EPIPE;
 677        }
 678
 679        rte_eal_trace_intr_callback_unregister(intr_handle, cb_fn, cb_arg,
 680                ret);
 681        return ret;
 682}
 683
 684int
 685rte_intr_callback_unregister_sync(const struct rte_intr_handle *intr_handle,
 686                        rte_intr_callback_fn cb_fn, void *cb_arg)
 687{
 688        int ret = 0;
 689
 690        while ((ret = rte_intr_callback_unregister(intr_handle, cb_fn, cb_arg)) == -EAGAIN)
 691                rte_pause();
 692
 693        return ret;
 694}
 695
 696int
 697rte_intr_enable(const struct rte_intr_handle *intr_handle)
 698{
 699        int rc = 0, uio_cfg_fd;
 700
 701        if (intr_handle == NULL)
 702                return -1;
 703
 704        if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
 705                rc = 0;
 706                goto out;
 707        }
 708
 709        uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
 710        if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
 711                rc = -1;
 712                goto out;
 713        }
 714
 715        switch (rte_intr_type_get(intr_handle)) {
 716        /* write to the uio fd to enable the interrupt */
 717        case RTE_INTR_HANDLE_UIO:
 718                if (uio_intr_enable(intr_handle))
 719                        rc = -1;
 720                break;
 721        case RTE_INTR_HANDLE_UIO_INTX:
 722                if (uio_intx_intr_enable(intr_handle))
 723                        rc = -1;
 724                break;
 725        /* not used at this moment */
 726        case RTE_INTR_HANDLE_ALARM:
 727                rc = -1;
 728                break;
 729#ifdef VFIO_PRESENT
 730        case RTE_INTR_HANDLE_VFIO_MSIX:
 731                if (vfio_enable_msix(intr_handle))
 732                        rc = -1;
 733                break;
 734        case RTE_INTR_HANDLE_VFIO_MSI:
 735                if (vfio_enable_msi(intr_handle))
 736                        rc = -1;
 737                break;
 738        case RTE_INTR_HANDLE_VFIO_LEGACY:
 739                if (vfio_enable_intx(intr_handle))
 740                        rc = -1;
 741                break;
 742#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 743        case RTE_INTR_HANDLE_VFIO_REQ:
 744                if (vfio_enable_req(intr_handle))
 745                        rc = -1;
 746                break;
 747#endif
 748#endif
 749        /* not used at this moment */
 750        case RTE_INTR_HANDLE_DEV_EVENT:
 751                rc = -1;
 752                break;
 753        /* unknown handle type */
 754        default:
 755                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
 756                        rte_intr_fd_get(intr_handle));
 757                rc = -1;
 758                break;
 759        }
 760out:
 761        rte_eal_trace_intr_enable(intr_handle, rc);
 762        return rc;
 763}
 764
 765/**
 766 * PMD generally calls this function at the end of its IRQ callback.
 767 * Internally, it unmasks the interrupt if possible.
 768 *
 769 * For INTx, unmasking is required as the interrupt is auto-masked prior to
  770 * invoking the callback.
 771 *
 772 * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
 773 * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
  774 * this function is a no-op.
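     *
     * Illustrative sketch of a PMD interrupt callback using it (hypothetical
     * driver names, not part of this file):
     *
     *     static void my_dev_interrupt_handler(void *param)
     *     {
     *             struct my_dev *dev = param;
     *
     *             my_dev_handle_link_event(dev);
     *             rte_intr_ack(dev->intr_handle);
     *     }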
 775 */
 776int
 777rte_intr_ack(const struct rte_intr_handle *intr_handle)
 778{
 779        int uio_cfg_fd;
 780
 781        if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
 782                return 0;
 783
 784        uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
 785        if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0)
 786                return -1;
 787
 788        switch (rte_intr_type_get(intr_handle)) {
  789        /* Both acking and enabling are the same for UIO */
 790        case RTE_INTR_HANDLE_UIO:
 791                if (uio_intr_enable(intr_handle))
 792                        return -1;
 793                break;
 794        case RTE_INTR_HANDLE_UIO_INTX:
 795                if (uio_intx_intr_enable(intr_handle))
 796                        return -1;
 797                break;
 798        /* not used at this moment */
 799        case RTE_INTR_HANDLE_ALARM:
 800                return -1;
 801#ifdef VFIO_PRESENT
 802        /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
 803        case RTE_INTR_HANDLE_VFIO_MSIX:
 804        case RTE_INTR_HANDLE_VFIO_MSI:
 805                return 0;
 806        case RTE_INTR_HANDLE_VFIO_LEGACY:
 807                if (vfio_ack_intx(intr_handle))
 808                        return -1;
 809                break;
 810#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 811        case RTE_INTR_HANDLE_VFIO_REQ:
 812                return -1;
 813#endif
 814#endif
 815        /* not used at this moment */
 816        case RTE_INTR_HANDLE_DEV_EVENT:
 817                return -1;
 818        /* unknown handle type */
 819        default:
 820                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
 821                        rte_intr_fd_get(intr_handle));
 822                return -1;
 823        }
 824
 825        return 0;
 826}
 827
 828int
 829rte_intr_disable(const struct rte_intr_handle *intr_handle)
 830{
 831        int rc = 0, uio_cfg_fd;
 832
 833        if (intr_handle == NULL)
 834                return -1;
 835
 836        if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
 837                rc = 0;
 838                goto out;
 839        }
 840
 841        uio_cfg_fd = rte_intr_dev_fd_get(intr_handle);
 842        if (rte_intr_fd_get(intr_handle) < 0 || uio_cfg_fd < 0) {
 843                rc = -1;
 844                goto out;
 845        }
 846
 847        switch (rte_intr_type_get(intr_handle)) {
 848        /* write to the uio fd to disable the interrupt */
 849        case RTE_INTR_HANDLE_UIO:
 850                if (uio_intr_disable(intr_handle))
 851                        rc = -1;
 852                break;
 853        case RTE_INTR_HANDLE_UIO_INTX:
 854                if (uio_intx_intr_disable(intr_handle))
 855                        rc = -1;
 856                break;
 857        /* not used at this moment */
 858        case RTE_INTR_HANDLE_ALARM:
 859                rc = -1;
 860                break;
 861#ifdef VFIO_PRESENT
 862        case RTE_INTR_HANDLE_VFIO_MSIX:
 863                if (vfio_disable_msix(intr_handle))
 864                        rc = -1;
 865                break;
 866        case RTE_INTR_HANDLE_VFIO_MSI:
 867                if (vfio_disable_msi(intr_handle))
 868                        rc = -1;
 869                break;
 870        case RTE_INTR_HANDLE_VFIO_LEGACY:
 871                if (vfio_disable_intx(intr_handle))
 872                        rc = -1;
 873                break;
 874#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 875        case RTE_INTR_HANDLE_VFIO_REQ:
 876                if (vfio_disable_req(intr_handle))
 877                        rc = -1;
 878                break;
 879#endif
 880#endif
 881        /* not used at this moment */
 882        case RTE_INTR_HANDLE_DEV_EVENT:
 883                rc = -1;
 884                break;
 885        /* unknown handle type */
 886        default:
 887                RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
 888                        rte_intr_fd_get(intr_handle));
 889                rc = -1;
 890                break;
 891        }
 892out:
 893        rte_eal_trace_intr_disable(intr_handle, rc);
 894        return rc;
 895}
 896
 897static int
 898eal_intr_process_interrupts(struct epoll_event *events, int nfds)
 899{
 900        bool call = false;
 901        int n, bytes_read, rv;
 902        struct rte_intr_source *src;
 903        struct rte_intr_callback *cb, *next;
 904        union rte_intr_read_buffer buf;
 905        struct rte_intr_callback active_cb;
 906
 907        for (n = 0; n < nfds; n++) {
 908
 909                /**
  910                 * if the pipe fd is ready to read, return so that the
  911                 * caller rebuilds the wait list.
 912                 */
 913                if (events[n].data.fd == intr_pipe.readfd){
 914                        int r = read(intr_pipe.readfd, buf.charbuf,
 915                                        sizeof(buf.charbuf));
 916                        RTE_SET_USED(r);
 917                        return -1;
 918                }
 919                rte_spinlock_lock(&intr_lock);
 920                TAILQ_FOREACH(src, &intr_sources, next)
 921                        if (rte_intr_fd_get(src->intr_handle) == events[n].data.fd)
 922                                break;
 923                if (src == NULL){
 924                        rte_spinlock_unlock(&intr_lock);
 925                        continue;
 926                }
 927
 928                /* mark this interrupt source as active and release the lock. */
 929                src->active = 1;
 930                rte_spinlock_unlock(&intr_lock);
 931
  932                /* set the length to be read for the different handle types */
 933                switch (rte_intr_type_get(src->intr_handle)) {
 934                case RTE_INTR_HANDLE_UIO:
 935                case RTE_INTR_HANDLE_UIO_INTX:
 936                        bytes_read = sizeof(buf.uio_intr_count);
 937                        break;
 938                case RTE_INTR_HANDLE_ALARM:
 939                        bytes_read = sizeof(buf.timerfd_num);
 940                        break;
 941#ifdef VFIO_PRESENT
 942#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
 943                case RTE_INTR_HANDLE_VFIO_REQ:
 944#endif
 945                case RTE_INTR_HANDLE_VFIO_MSIX:
 946                case RTE_INTR_HANDLE_VFIO_MSI:
 947                case RTE_INTR_HANDLE_VFIO_LEGACY:
 948                        bytes_read = sizeof(buf.vfio_intr_count);
 949                        break;
 950#endif
 951                case RTE_INTR_HANDLE_VDEV:
 952                case RTE_INTR_HANDLE_EXT:
 953                        bytes_read = 0;
 954                        call = true;
 955                        break;
 956                case RTE_INTR_HANDLE_DEV_EVENT:
 957                        bytes_read = 0;
 958                        call = true;
 959                        break;
 960                default:
 961                        bytes_read = 1;
 962                        break;
 963                }
 964
 965                if (bytes_read > 0) {
 966                        /**
 967                         * read out to clear the ready-to-be-read flag
 968                         * for epoll_wait.
 969                         */
 970                        bytes_read = read(events[n].data.fd, &buf, bytes_read);
 971                        if (bytes_read < 0) {
 972                                if (errno == EINTR || errno == EWOULDBLOCK)
 973                                        continue;
 974
 975                                RTE_LOG(ERR, EAL, "Error reading from file "
 976                                        "descriptor %d: %s\n",
 977                                        events[n].data.fd,
 978                                        strerror(errno));
 979                                /*
 980                                 * The device is unplugged or buggy, remove
 981                                 * it as an interrupt source and return to
 982                                 * force the wait list to be rebuilt.
 983                                 */
 984                                rte_spinlock_lock(&intr_lock);
 985                                TAILQ_REMOVE(&intr_sources, src, next);
 986                                rte_spinlock_unlock(&intr_lock);
 987
 988                                for (cb = TAILQ_FIRST(&src->callbacks); cb;
 989                                                        cb = next) {
 990                                        next = TAILQ_NEXT(cb, next);
 991                                        TAILQ_REMOVE(&src->callbacks, cb, next);
 992                                        free(cb);
 993                                }
 994                                rte_intr_instance_free(src->intr_handle);
 995                                free(src);
 996                                return -1;
 997                        } else if (bytes_read == 0)
 998                                RTE_LOG(ERR, EAL, "Read nothing from file "
 999                                        "descriptor %d\n", events[n].data.fd);
1000                        else
1001                                call = true;
1002                }
1003
 1004                /* grab the lock again to call callbacks and update status. */
1005                rte_spinlock_lock(&intr_lock);
1006
1007                if (call) {
1008
1009                        /* Finally, call all callbacks. */
1010                        TAILQ_FOREACH(cb, &src->callbacks, next) {
1011
1012                                /* make a copy and unlock. */
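                                    /* the copy lets us drop intr_lock while
                                     * the callback runs, so the callback can
                                     * itself use the (un)register APIs, which
                                     * take intr_lock, without deadlocking
                                     */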
1013                                active_cb = *cb;
1014                                rte_spinlock_unlock(&intr_lock);
1015
1016                                /* call the actual callback */
1017                                active_cb.cb_fn(active_cb.cb_arg);
1018
 1019                                /* get the lock back. */
1020                                rte_spinlock_lock(&intr_lock);
1021                        }
1022                }
 1023                /* we are done with that interrupt source, release it. */
1024                src->active = 0;
1025
1026                rv = 0;
1027
 1028                /* check if any callbacks are supposed to be removed */
1029                for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
1030                        next = TAILQ_NEXT(cb, next);
1031                        if (cb->pending_delete) {
1032                                TAILQ_REMOVE(&src->callbacks, cb, next);
1033                                if (cb->ucb_fn)
1034                                        cb->ucb_fn(src->intr_handle, cb->cb_arg);
1035                                free(cb);
1036                                rv++;
1037                        }
1038                }
1039
1040                /* all callbacks for that source are removed. */
1041                if (TAILQ_EMPTY(&src->callbacks)) {
1042                        TAILQ_REMOVE(&intr_sources, src, next);
1043                        rte_intr_instance_free(src->intr_handle);
1044                        free(src);
1045                }
1046
 1047                /* notify the pipe fd that epoll_wait waits on so the wait list is rebuilt */
1048                if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
1049                        rte_spinlock_unlock(&intr_lock);
1050                        return -EPIPE;
1051                }
1052
1053                rte_spinlock_unlock(&intr_lock);
1054        }
1055
1056        return 0;
1057}
1058
1059/**
1060 * It handles all the interrupts.
1061 *
1062 * @param pfd
1063 *  epoll file descriptor.
1064 * @param totalfds
1065 *  The number of file descriptors added in epoll.
1066 *
1067 * @return
1068 *  void
1069 */
1070static void
1071eal_intr_handle_interrupts(int pfd, unsigned totalfds)
1072{
1073        struct epoll_event events[totalfds];
1074        int nfds = 0;
1075
1076        for(;;) {
1077                nfds = epoll_wait(pfd, events, totalfds,
1078                        EAL_INTR_EPOLL_WAIT_FOREVER);
1079                /* epoll_wait fail */
1080                if (nfds < 0) {
1081                        if (errno == EINTR)
1082                                continue;
1083                        RTE_LOG(ERR, EAL,
1084                                "epoll_wait returns with fail\n");
1085                        return;
1086                }
 1087                /* epoll_wait timeout, should never happen here */
1088                else if (nfds == 0)
1089                        continue;
1090                /* epoll_wait has at least one fd ready to read */
1091                if (eal_intr_process_interrupts(events, nfds) < 0)
1092                        return;
1093        }
1094}
1095
1096/**
1097 * It builds/rebuilds up the epoll file descriptor with all the
1098 * file descriptors being waited on. Then handles the interrupts.
1099 *
1100 * @param arg
1101 *  pointer. (unused)
1102 *
1103 * @return
1104 *  never return;
1105 */
1106static __rte_noreturn void *
1107eal_intr_thread_main(__rte_unused void *arg)
1108{
1109        /* host thread, never break out */
1110        for (;;) {
1111                /* build up the epoll fd with all descriptors we are to
1112                 * wait on then pass it to the handle_interrupts function
1113                 */
1114                static struct epoll_event pipe_event = {
1115                        .events = EPOLLIN | EPOLLPRI,
1116                };
1117                struct rte_intr_source *src;
1118                unsigned numfds = 0;
1119
1120                /* create epoll fd */
1121                int pfd = epoll_create(1);
1122                if (pfd < 0)
1123                        rte_panic("Cannot create epoll instance\n");
1124
1125                pipe_event.data.fd = intr_pipe.readfd;
1126                /**
1127                 * add pipe fd into wait list, this pipe is used to
1128                 * rebuild the wait list.
1129                 */
1130                if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
1131                                                &pipe_event) < 0) {
1132                        rte_panic("Error adding fd to %d epoll_ctl, %s\n",
1133                                        intr_pipe.readfd, strerror(errno));
1134                }
1135                numfds++;
1136
1137                rte_spinlock_lock(&intr_lock);
1138
1139                TAILQ_FOREACH(src, &intr_sources, next) {
1140                        struct epoll_event ev;
1141
1142                        if (src->callbacks.tqh_first == NULL)
1143                                continue; /* skip those with no callbacks */
1144                        memset(&ev, 0, sizeof(ev));
1145                        ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
1146                        ev.data.fd = rte_intr_fd_get(src->intr_handle);
1147
1148                        /**
 1149                         * add the device's file descriptor
 1150                         * into the wait list.
1151                         */
1152                        if (epoll_ctl(pfd, EPOLL_CTL_ADD,
1153                                        rte_intr_fd_get(src->intr_handle), &ev) < 0) {
1154                                rte_panic("Error adding fd %d epoll_ctl, %s\n",
1155                                        rte_intr_fd_get(src->intr_handle),
1156                                        strerror(errno));
1157                        }
1158                        else
1159                                numfds++;
1160                }
1161                rte_spinlock_unlock(&intr_lock);
1162                /* serve the interrupt */
1163                eal_intr_handle_interrupts(pfd, numfds);
1164
1165                /**
1166                 * when we return, we need to rebuild the
1167                 * list of fds to monitor.
1168                 */
1169                close(pfd);
1170        }
1171}
1172
1173int
1174rte_eal_intr_init(void)
1175{
1176        int ret = 0;
1177
1178        /* init the global interrupt source head */
1179        TAILQ_INIT(&intr_sources);
1180
1181        /**
 1182         * create a pipe that the interrupt thread waits on with epoll; writing
 1183         * to it notifies the thread to rebuild its epoll wait list.
1184         */
1185        if (pipe(intr_pipe.pipefd) < 0) {
1186                rte_errno = errno;
1187                return -1;
1188        }
1189
1190        /* create the host thread to wait/handle the interrupt */
1191        ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
1192                        eal_intr_thread_main, NULL);
1193        if (ret != 0) {
1194                rte_errno = -ret;
1195                RTE_LOG(ERR, EAL,
1196                        "Failed to create thread for interrupt handling\n");
1197        }
1198
1199        return ret;
1200}
1201
1202static void
1203eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
1204{
1205        union rte_intr_read_buffer buf;
1206        int bytes_read = 0;
1207        int nbytes;
1208
1209        switch (rte_intr_type_get(intr_handle)) {
1210        case RTE_INTR_HANDLE_UIO:
1211        case RTE_INTR_HANDLE_UIO_INTX:
1212                bytes_read = sizeof(buf.uio_intr_count);
1213                break;
1214#ifdef VFIO_PRESENT
1215        case RTE_INTR_HANDLE_VFIO_MSIX:
1216        case RTE_INTR_HANDLE_VFIO_MSI:
1217        case RTE_INTR_HANDLE_VFIO_LEGACY:
1218                bytes_read = sizeof(buf.vfio_intr_count);
1219                break;
1220#endif
1221        case RTE_INTR_HANDLE_VDEV:
1222                bytes_read = rte_intr_efd_counter_size_get(intr_handle);
1223                /* For vdev, number of bytes to read is set by driver */
1224                break;
1225        case RTE_INTR_HANDLE_EXT:
1226                return;
1227        default:
1228                bytes_read = 1;
1229                RTE_LOG(INFO, EAL, "unexpected intr type\n");
1230                break;
1231        }
1232
1233        /**
1234         * read out to clear the ready-to-be-read flag
1235         * for epoll_wait.
1236         */
1237        if (bytes_read == 0)
1238                return;
1239        do {
1240                nbytes = read(fd, &buf, bytes_read);
1241                if (nbytes < 0) {
1242                        if (errno == EINTR || errno == EWOULDBLOCK ||
1243                            errno == EAGAIN)
1244                                continue;
1245                        RTE_LOG(ERR, EAL,
1246                                "Error reading from fd %d: %s\n",
1247                                fd, strerror(errno));
1248                } else if (nbytes == 0)
1249                        RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
1250                return;
1251        } while (1);
1252}
1253
1254static int
1255eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
1256                        struct rte_epoll_event *events)
1257{
1258        unsigned int i, count = 0;
1259        struct rte_epoll_event *rev;
1260        uint32_t valid_status;
1261
1262        for (i = 0; i < n; i++) {
1263                rev = evs[i].data.ptr;
1264                valid_status =  RTE_EPOLL_VALID;
1265                /* ACQUIRE memory ordering here pairs with RELEASE
1266                 * ordering below acting as a lock to synchronize
1267                 * the event data updating.
1268                 */
1269                if (!rev || !__atomic_compare_exchange_n(&rev->status,
1270                                    &valid_status, RTE_EPOLL_EXEC, 0,
1271                                    __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
1272                        continue;
1273
1274                events[count].status        = RTE_EPOLL_VALID;
1275                events[count].fd            = rev->fd;
1276                events[count].epfd          = rev->epfd;
1277                events[count].epdata.event  = evs[i].events;
1278                events[count].epdata.data   = rev->epdata.data;
1279                if (rev->epdata.cb_fun)
1280                        rev->epdata.cb_fun(rev->fd,
1281                                           rev->epdata.cb_arg);
1282
1283                /* the status update should be observed after
1284                 * the other fields change.
1285                 */
1286                __atomic_store_n(&rev->status, RTE_EPOLL_VALID,
1287                                __ATOMIC_RELEASE);
1288                count++;
1289        }
1290        return count;
1291}
1292
1293static inline int
1294eal_init_tls_epfd(void)
1295{
1296        int pfd = epoll_create(255);
1297
1298        if (pfd < 0) {
1299                RTE_LOG(ERR, EAL,
1300                        "Cannot create epoll instance\n");
1301                return -1;
1302        }
1303        return pfd;
1304}
1305
1306int
1307rte_intr_tls_epfd(void)
1308{
1309        if (RTE_PER_LCORE(_epfd) == -1)
1310                RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
1311
1312        return RTE_PER_LCORE(_epfd);
1313}
1314
1315static int
1316eal_epoll_wait(int epfd, struct rte_epoll_event *events,
1317               int maxevents, int timeout, bool interruptible)
1318{
1319        struct epoll_event evs[maxevents];
1320        int rc;
1321
1322        if (!events) {
1323                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1324                return -1;
1325        }
1326
1327        /* using per thread epoll fd */
1328        if (epfd == RTE_EPOLL_PER_THREAD)
1329                epfd = rte_intr_tls_epfd();
1330
1331        while (1) {
1332                rc = epoll_wait(epfd, evs, maxevents, timeout);
1333                if (likely(rc > 0)) {
1334                        /* epoll_wait has at least one fd ready to read */
1335                        rc = eal_epoll_process_event(evs, rc, events);
1336                        break;
1337                } else if (rc < 0) {
1338                        if (errno == EINTR) {
1339                                if (interruptible)
1340                                        return -1;
1341                                else
1342                                        continue;
1343                        }
1344                        /* epoll_wait fail */
1345                        RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
1346                                strerror(errno));
1347                        rc = -1;
1348                        break;
1349                } else {
1350                        /* rc == 0, epoll_wait timed out */
1351                        break;
1352                }
1353        }
1354
1355        return rc;
1356}
1357
1358int
1359rte_epoll_wait(int epfd, struct rte_epoll_event *events,
1360               int maxevents, int timeout)
1361{
1362        return eal_epoll_wait(epfd, events, maxevents, timeout, false);
1363}
1364
1365int
1366rte_epoll_wait_interruptible(int epfd, struct rte_epoll_event *events,
1367                             int maxevents, int timeout)
1368{
1369        return eal_epoll_wait(epfd, events, maxevents, timeout, true);
1370}
1371
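    /* Wait until the event is no longer being delivered (status RTE_EPOLL_EXEC
     * in eal_epoll_process_event()) and then atomically mark it invalid, so it
     * cannot be handed to a callback once teardown of its data has started.
     */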
1372static inline void
1373eal_epoll_data_safe_free(struct rte_epoll_event *ev)
1374{
1375        uint32_t valid_status = RTE_EPOLL_VALID;
1376
1377        while (!__atomic_compare_exchange_n(&ev->status, &valid_status,
1378                    RTE_EPOLL_INVALID, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
1379                while (__atomic_load_n(&ev->status,
1380                                __ATOMIC_RELAXED) != RTE_EPOLL_VALID)
1381                        rte_pause();
1382                valid_status = RTE_EPOLL_VALID;
1383        }
1384        memset(&ev->epdata, 0, sizeof(ev->epdata));
1385        ev->fd = -1;
1386        ev->epfd = -1;
1387}
1388
1389int
1390rte_epoll_ctl(int epfd, int op, int fd,
1391              struct rte_epoll_event *event)
1392{
1393        struct epoll_event ev;
1394
1395        if (!event) {
1396                RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
1397                return -1;
1398        }
1399
1400        /* using per thread epoll fd */
1401        if (epfd == RTE_EPOLL_PER_THREAD)
1402                epfd = rte_intr_tls_epfd();
1403
1404        if (op == EPOLL_CTL_ADD) {
1405                __atomic_store_n(&event->status, RTE_EPOLL_VALID,
1406                                __ATOMIC_RELAXED);
1407                event->fd = fd;  /* ignore fd in event */
1408                event->epfd = epfd;
1409                ev.data.ptr = (void *)event;
1410        }
1411
1412        ev.events = event->epdata.event;
1413        if (epoll_ctl(epfd, op, fd, &ev) < 0) {
1414                RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
1415                        op, fd, strerror(errno));
1416                if (op == EPOLL_CTL_ADD)
 1417                        /* roll back the status when CTL_ADD fails */
1418                        __atomic_store_n(&event->status, RTE_EPOLL_INVALID,
1419                                        __ATOMIC_RELAXED);
1420                return -1;
1421        }
1422
1423        if (op == EPOLL_CTL_DEL && __atomic_load_n(&event->status,
1424                        __ATOMIC_RELAXED) != RTE_EPOLL_INVALID)
1425                eal_epoll_data_safe_free(event);
1426
1427        return 0;
1428}
1429
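    /*
     * Illustrative datapath-interrupt flow (hypothetical variables): after
     * rte_intr_efd_enable() has set up one eventfd per Rx queue, each queue's
     * vector is added to an epoll set and waited on, e.g.
     *
     *     rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
     *                     RTE_INTR_EVENT_ADD,
     *                     RTE_INTR_VEC_RXTX_OFFSET + queue_id, NULL);
     *     ...
     *     n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, events, maxevents, timeout);
     */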
1430int
1431rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
1432                int op, unsigned int vec, void *data)
1433{
1434        struct rte_epoll_event *rev;
1435        struct rte_epoll_data *epdata;
1436        int epfd_op;
1437        unsigned int efd_idx;
1438        int rc = 0;
1439
1440        efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
1441                (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
1442
1443        if (intr_handle == NULL || rte_intr_nb_efd_get(intr_handle) == 0 ||
1444                        efd_idx >= (unsigned int)rte_intr_nb_efd_get(intr_handle)) {
1445                RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
1446                return -EPERM;
1447        }
1448
1449        switch (op) {
1450        case RTE_INTR_EVENT_ADD:
1451                epfd_op = EPOLL_CTL_ADD;
1452                rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1453                if (__atomic_load_n(&rev->status,
1454                                __ATOMIC_RELAXED) != RTE_EPOLL_INVALID) {
1455                        RTE_LOG(INFO, EAL, "Event already been added.\n");
1456                        return -EEXIST;
1457                }
1458
1459                /* attach to intr vector fd */
1460                epdata = &rev->epdata;
1461                epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
1462                epdata->data   = data;
1463                epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
1464                epdata->cb_arg = (void *)intr_handle;
1465                rc = rte_epoll_ctl(epfd, epfd_op,
1466                        rte_intr_efds_index_get(intr_handle, efd_idx), rev);
1467                if (!rc)
1468                        RTE_LOG(DEBUG, EAL,
1469                                "efd %d associated with vec %d added on epfd %d"
1470                                "\n", rev->fd, vec, epfd);
1471                else
1472                        rc = -EPERM;
1473                break;
1474        case RTE_INTR_EVENT_DEL:
1475                epfd_op = EPOLL_CTL_DEL;
1476                rev = rte_intr_elist_index_get(intr_handle, efd_idx);
1477                if (__atomic_load_n(&rev->status,
1478                                __ATOMIC_RELAXED) == RTE_EPOLL_INVALID) {
1479                        RTE_LOG(INFO, EAL, "Event does not exist.\n");
1480                        return -EPERM;
1481                }
1482
1483                rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
1484                if (rc)
1485                        rc = -EPERM;
1486                break;
1487        default:
1488                RTE_LOG(ERR, EAL, "event op type mismatch\n");
1489                rc = -EPERM;
1490        }
1491
1492        return rc;
1493}
1494
1495void
1496rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
1497{
1498        uint32_t i;
1499        struct rte_epoll_event *rev;
1500
1501        for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++) {
1502                rev = rte_intr_elist_index_get(intr_handle, i);
1503                if (__atomic_load_n(&rev->status,
1504                                __ATOMIC_RELAXED) == RTE_EPOLL_INVALID)
1505                        continue;
1506                if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
 1507                        /* force free if the entry is valid */
1508                        eal_epoll_data_safe_free(rev);
1509                }
1510        }
1511}
1512
1513int
1514rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
1515{
1516        uint32_t i;
1517        int fd;
1518        uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1519
1520        assert(nb_efd != 0);
1521
1522        if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX) {
1523                for (i = 0; i < n; i++) {
1524                        fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
1525                        if (fd < 0) {
1526                                RTE_LOG(ERR, EAL,
1527                                        "can't setup eventfd, error %i (%s)\n",
1528                                        errno, strerror(errno));
1529                                return -errno;
1530                        }
1531
1532                        if (rte_intr_efds_index_set(intr_handle, i, fd))
1533                                return -rte_errno;
1534                }
1535
1536                if (rte_intr_nb_efd_set(intr_handle, n))
1537                        return -rte_errno;
1538
1539                if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR + n))
1540                        return -rte_errno;
1541        } else if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV) {
 1542                /* only check; initialization is done in the vdev driver. */
1543                if ((uint64_t)rte_intr_efd_counter_size_get(intr_handle) >
1544                    sizeof(union rte_intr_read_buffer)) {
1545                        RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
1546                        return -EINVAL;
1547                }
1548        } else {
1549                if (rte_intr_efds_index_set(intr_handle, 0, rte_intr_fd_get(intr_handle)))
1550                        return -rte_errno;
1551                if (rte_intr_nb_efd_set(intr_handle, RTE_MIN(nb_efd, 1U)))
1552                        return -rte_errno;
1553                if (rte_intr_max_intr_set(intr_handle, NB_OTHER_INTR))
1554                        return -rte_errno;
1555        }
1556
1557        return 0;
1558}
1559
1560void
1561rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
1562{
1563        uint32_t i;
1564
1565        rte_intr_free_epoll_fd(intr_handle);
1566        if (rte_intr_max_intr_get(intr_handle) > rte_intr_nb_efd_get(intr_handle)) {
1567                for (i = 0; i < (uint32_t)rte_intr_nb_efd_get(intr_handle); i++)
1568                        close(rte_intr_efds_index_get(intr_handle, i));
1569        }
1570        rte_intr_nb_efd_set(intr_handle, 0);
1571        rte_intr_max_intr_set(intr_handle, 0);
1572}
1573
1574int
1575rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
1576{
1577        return !(!rte_intr_nb_efd_get(intr_handle));
1578}
1579
1580int
1581rte_intr_allow_others(struct rte_intr_handle *intr_handle)
1582{
1583        if (!rte_intr_dp_is_en(intr_handle))
1584                return 1;
1585        else
1586                return !!(rte_intr_max_intr_get(intr_handle) -
1587                                rte_intr_nb_efd_get(intr_handle));
1588}
1589
1590int
1591rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
1592{
1593        if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VFIO_MSIX)
1594                return 1;
1595
1596        if (rte_intr_type_get(intr_handle) == RTE_INTR_HANDLE_VDEV)
1597                return 1;
1598
1599        return 0;
1600}
1601
1602int rte_thread_is_intr(void)
1603{
1604        return pthread_equal(intr_thread, pthread_self());
1605}
1606