linux/drivers/net/ethernet/sfc/rx_common.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/****************************************************************************
   3 * Driver for Solarflare network controllers and boards
   4 * Copyright 2018 Solarflare Communications Inc.
   5 *
   6 * This program is free software; you can redistribute it and/or modify it
   7 * under the terms of the GNU General Public License version 2 as published
   8 * by the Free Software Foundation, incorporated herein by reference.
   9 */
  10
  11#include "net_driver.h"
  12#include <linux/module.h>
  13#include <linux/iommu.h>
  14#include "efx.h"
  15#include "nic.h"
  16#include "rx_common.h"
  17
  18/* This is the percentage fill level below which new RX descriptors
  19 * will be added to the RX descriptor ring.
  20 */
  21static unsigned int rx_refill_threshold;
  22module_param(rx_refill_threshold, uint, 0444);
  23MODULE_PARM_DESC(rx_refill_threshold,
  24                 "RX descriptor ring refill threshold (%)");
  25
  26/* Number of RX buffers to recycle pages for.  When creating the RX page recycle
  27 * ring, this number is divided by the number of buffers per page to calculate
  28 * the number of pages to store in the RX page recycle ring.
  29 */
  30#define EFX_RECYCLE_RING_SIZE_IOMMU 4096
  31#define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH)
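/* The much larger ring in the IOMMU case reflects the cost of
 * dma_map_page()/dma_unmap_page() when an IOMMU is in use: recycling an
 * already-mapped page saves correspondingly more work there than it does
 * with direct mapping.
 */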
  32
  33/* RX maximum head room required.
  34 *
  35 * This must be at least 1 to prevent overflow, plus one packet-worth
  36 * to allow pipelined receives.
  37 */
  38#define EFX_RXD_HEAD_ROOM (1 + EFX_RX_MAX_FRAGS)
  39
  40/* Check the RX page recycle ring for a page that can be reused. */
  41static struct page *efx_reuse_page(struct efx_rx_queue *rx_queue)
  42{
  43        struct efx_nic *efx = rx_queue->efx;
  44        struct efx_rx_page_state *state;
  45        unsigned int index;
  46        struct page *page;
  47
  48        if (unlikely(!rx_queue->page_ring))
  49                return NULL;
  50        index = rx_queue->page_remove & rx_queue->page_ptr_mask;
  51        page = rx_queue->page_ring[index];
  52        if (page == NULL)
  53                return NULL;
  54
  55        rx_queue->page_ring[index] = NULL;
  56        /* page_remove cannot exceed page_add. */
  57        if (rx_queue->page_remove != rx_queue->page_add)
  58                ++rx_queue->page_remove;
  59
  60        /* If page_count is 1 then we hold the only reference to this page. */
  61        if (page_count(page) == 1) {
  62                ++rx_queue->page_recycle_count;
  63                return page;
  64        } else {
  65                state = page_address(page);
  66                dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
  67                               PAGE_SIZE << efx->rx_buffer_order,
  68                               DMA_FROM_DEVICE);
  69                put_page(page);
  70                ++rx_queue->page_recycle_failed;
  71        }
  72
  73        return NULL;
  74}
  75
  76/* Attempt to recycle the page if there is an RX recycle ring; the page can
  77 * only be added if this is the final RX buffer, to prevent pages being used in
  78 * the descriptor ring and appearing in the recycle ring simultaneously.
  79 */
  80static void efx_recycle_rx_page(struct efx_channel *channel,
  81                                struct efx_rx_buffer *rx_buf)
  82{
  83        struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
  84        struct efx_nic *efx = rx_queue->efx;
  85        struct page *page = rx_buf->page;
  86        unsigned int index;
  87
  88        /* Only recycle the page after processing the final buffer. */
  89        if (!(rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE))
  90                return;
  91
  92        index = rx_queue->page_add & rx_queue->page_ptr_mask;
  93        if (rx_queue->page_ring[index] == NULL) {
  94                unsigned int read_index = rx_queue->page_remove &
  95                        rx_queue->page_ptr_mask;
  96
  97                /* The next slot in the recycle ring is available, but
  98                 * increment page_remove if the read pointer currently
  99                 * points here.
 100                 */
 101                if (read_index == index)
 102                        ++rx_queue->page_remove;
 103                rx_queue->page_ring[index] = page;
 104                ++rx_queue->page_add;
 105                return;
 106        }
 107        ++rx_queue->page_recycle_full;
 108        efx_unmap_rx_buffer(efx, rx_buf);
 109        put_page(rx_buf->page);
 110}
 111
 112/* Recycle the pages that are used by buffers that have just been received. */
 113void efx_recycle_rx_pages(struct efx_channel *channel,
 114                          struct efx_rx_buffer *rx_buf,
 115                          unsigned int n_frags)
 116{
 117        struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
 118
 119        if (unlikely(!rx_queue->page_ring))
 120                return;
 121
 122        do {
 123                efx_recycle_rx_page(channel, rx_buf);
 124                rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
 125        } while (--n_frags);
 126}
 127
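/* Discard a received packet: return its pages to the recycle ring where
 * possible and drop the buffer references we hold.
 */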
 128void efx_discard_rx_packet(struct efx_channel *channel,
 129                           struct efx_rx_buffer *rx_buf,
 130                           unsigned int n_frags)
 131{
 132        struct efx_rx_queue *rx_queue = efx_channel_get_rx_queue(channel);
 133
 134        efx_recycle_rx_pages(channel, rx_buf, n_frags);
 135
 136        efx_free_rx_buffers(rx_queue, rx_buf, n_frags);
 137}
 138
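/* Size and allocate the RX page recycle ring.  As a purely illustrative
 * example, with an IOMMU present and two buffers per page this would be
 * roundup_pow_of_two(4096 / 2) = 2048 page pointers, giving a page_ptr_mask
 * of 2047.  If the allocation fails, page_ptr_mask is set to 0 and page
 * recycling is effectively disabled: both efx_reuse_page() and
 * efx_recycle_rx_pages() bail out when page_ring is NULL.
 */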
 139static void efx_init_rx_recycle_ring(struct efx_rx_queue *rx_queue)
 140{
 141        unsigned int bufs_in_recycle_ring, page_ring_size;
 142        struct efx_nic *efx = rx_queue->efx;
 143
 144        /* Set the RX recycle ring size */
 145#ifdef CONFIG_PPC64
 146        bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
 147#else
 148        if (iommu_present(&pci_bus_type))
 149                bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU;
 150        else
 151                bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU;
 152#endif /* CONFIG_PPC64 */
 153
 154        page_ring_size = roundup_pow_of_two(bufs_in_recycle_ring /
 155                                            efx->rx_bufs_per_page);
 156        rx_queue->page_ring = kcalloc(page_ring_size,
 157                                      sizeof(*rx_queue->page_ring), GFP_KERNEL);
 158        if (!rx_queue->page_ring)
 159                rx_queue->page_ptr_mask = 0;
 160        else
 161                rx_queue->page_ptr_mask = page_ring_size - 1;
 162}
 163
 164static void efx_fini_rx_recycle_ring(struct efx_rx_queue *rx_queue)
 165{
 166        struct efx_nic *efx = rx_queue->efx;
 167        int i;
 168
 169        /* Unmap and release the pages in the recycle ring. Remove the ring. */
 170        for (i = 0; i <= rx_queue->page_ptr_mask; i++) {
 171                struct page *page = rx_queue->page_ring[i];
 172                struct efx_rx_page_state *state;
 173
 174                if (page == NULL)
 175                        continue;
 176
 177                state = page_address(page);
 178                dma_unmap_page(&efx->pci_dev->dev, state->dma_addr,
 179                               PAGE_SIZE << efx->rx_buffer_order,
 180                               DMA_FROM_DEVICE);
 181                put_page(page);
 182        }
 183        kfree(rx_queue->page_ring);
 184        rx_queue->page_ring = NULL;
 185}
 186
 187static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
 188                               struct efx_rx_buffer *rx_buf)
 189{
 190        /* Release the page reference we hold for the buffer. */
 191        if (rx_buf->page)
 192                put_page(rx_buf->page);
 193
 194        /* If this is the last buffer in a page, unmap and free it. */
 195        if (rx_buf->flags & EFX_RX_BUF_LAST_IN_PAGE) {
 196                efx_unmap_rx_buffer(rx_queue->efx, rx_buf);
 197                efx_free_rx_buffers(rx_queue, rx_buf, 1);
 198        }
 199        rx_buf->page = NULL;
 200}
 201
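/* Allocate the software and hardware state for an RX queue.  The ring is
 * sized to the smallest power of two that holds efx->rxq_entries (but at
 * least EFX_MIN_DMAQ_SIZE), with one struct efx_rx_buffer per descriptor.
 */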
 202int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
 203{
 204        struct efx_nic *efx = rx_queue->efx;
 205        unsigned int entries;
 206        int rc;
 207
  208        /* Create the smallest power-of-two-sized ring large enough */
 209        entries = max(roundup_pow_of_two(efx->rxq_entries), EFX_MIN_DMAQ_SIZE);
 210        EFX_WARN_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE);
 211        rx_queue->ptr_mask = entries - 1;
 212
 213        netif_dbg(efx, probe, efx->net_dev,
 214                  "creating RX queue %d size %#x mask %#x\n",
 215                  efx_rx_queue_index(rx_queue), efx->rxq_entries,
 216                  rx_queue->ptr_mask);
 217
 218        /* Allocate RX buffers */
 219        rx_queue->buffer = kcalloc(entries, sizeof(*rx_queue->buffer),
 220                                   GFP_KERNEL);
 221        if (!rx_queue->buffer)
 222                return -ENOMEM;
 223
 224        rc = efx_nic_probe_rx(rx_queue);
 225        if (rc) {
 226                kfree(rx_queue->buffer);
 227                rx_queue->buffer = NULL;
 228        }
 229
 230        return rc;
 231}
 232
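/* (Re)initialise an RX queue for use.  The fill limits work out as follows
 * (numbers purely illustrative): with rxq_entries = 512 the queue is never
 * filled beyond max_fill = 512 - EFX_RXD_HEAD_ROOM descriptors, and with
 * the default rx_refill_threshold of 0 the fast-fill trigger sits one
 * allocation batch (rx_pages_per_batch * rx_bufs_per_page buffers) below
 * max_fill.
 */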
 233void efx_init_rx_queue(struct efx_rx_queue *rx_queue)
 234{
 235        unsigned int max_fill, trigger, max_trigger;
 236        struct efx_nic *efx = rx_queue->efx;
 237        int rc = 0;
 238
 239        netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
 240                  "initialising RX queue %d\n", efx_rx_queue_index(rx_queue));
 241
 242        /* Initialise ptr fields */
 243        rx_queue->added_count = 0;
 244        rx_queue->notified_count = 0;
 245        rx_queue->removed_count = 0;
 246        rx_queue->min_fill = -1U;
 247        efx_init_rx_recycle_ring(rx_queue);
 248
 249        rx_queue->page_remove = 0;
 250        rx_queue->page_add = rx_queue->page_ptr_mask + 1;
 251        rx_queue->page_recycle_count = 0;
 252        rx_queue->page_recycle_failed = 0;
 253        rx_queue->page_recycle_full = 0;
 254
 255        /* Initialise limit fields */
 256        max_fill = efx->rxq_entries - EFX_RXD_HEAD_ROOM;
 257        max_trigger =
 258                max_fill - efx->rx_pages_per_batch * efx->rx_bufs_per_page;
 259        if (rx_refill_threshold != 0) {
 260                trigger = max_fill * min(rx_refill_threshold, 100U) / 100U;
 261                if (trigger > max_trigger)
 262                        trigger = max_trigger;
 263        } else {
 264                trigger = max_trigger;
 265        }
 266
 267        rx_queue->max_fill = max_fill;
 268        rx_queue->fast_fill_trigger = trigger;
 269        rx_queue->refill_enabled = true;
 270
 271        /* Initialise XDP queue information */
 272        rc = xdp_rxq_info_reg(&rx_queue->xdp_rxq_info, efx->net_dev,
 273                              rx_queue->core_index, 0);
 274
 275        if (rc) {
 276                netif_err(efx, rx_err, efx->net_dev,
 277                          "Failure to initialise XDP queue information rc=%d\n",
 278                          rc);
 279                efx->xdp_rxq_info_failed = true;
 280        } else {
 281                rx_queue->xdp_rxq_info_valid = true;
 282        }
 283
 284        /* Set up RX descriptor ring */
 285        efx_nic_init_rx(rx_queue);
 286}
 287
 288void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
 289{
 290        struct efx_rx_buffer *rx_buf;
 291        int i;
 292
 293        netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
 294                  "shutting down RX queue %d\n", efx_rx_queue_index(rx_queue));
 295
 296        del_timer_sync(&rx_queue->slow_fill);
 297
 298        /* Release RX buffers from the current read ptr to the write ptr */
 299        if (rx_queue->buffer) {
 300                for (i = rx_queue->removed_count; i < rx_queue->added_count;
 301                     i++) {
 302                        unsigned int index = i & rx_queue->ptr_mask;
 303
 304                        rx_buf = efx_rx_buffer(rx_queue, index);
 305                        efx_fini_rx_buffer(rx_queue, rx_buf);
 306                }
 307        }
 308
 309        efx_fini_rx_recycle_ring(rx_queue);
 310
 311        if (rx_queue->xdp_rxq_info_valid)
 312                xdp_rxq_info_unreg(&rx_queue->xdp_rxq_info);
 313
 314        rx_queue->xdp_rxq_info_valid = false;
 315}
 316
 317void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
 318{
 319        netif_dbg(rx_queue->efx, drv, rx_queue->efx->net_dev,
 320                  "destroying RX queue %d\n", efx_rx_queue_index(rx_queue));
 321
 322        efx_nic_remove_rx(rx_queue);
 323
 324        kfree(rx_queue->buffer);
 325        rx_queue->buffer = NULL;
 326}
 327
 328/* Unmap a DMA-mapped page.  This function is only called for the final RX
 329 * buffer in a page.
 330 */
 331void efx_unmap_rx_buffer(struct efx_nic *efx,
 332                         struct efx_rx_buffer *rx_buf)
 333{
 334        struct page *page = rx_buf->page;
 335
 336        if (page) {
 337                struct efx_rx_page_state *state = page_address(page);
 338
 339                dma_unmap_page(&efx->pci_dev->dev,
 340                               state->dma_addr,
 341                               PAGE_SIZE << efx->rx_buffer_order,
 342                               DMA_FROM_DEVICE);
 343        }
 344}
 345
 346void efx_free_rx_buffers(struct efx_rx_queue *rx_queue,
 347                         struct efx_rx_buffer *rx_buf,
 348                         unsigned int num_bufs)
 349{
 350        do {
 351                if (rx_buf->page) {
 352                        put_page(rx_buf->page);
 353                        rx_buf->page = NULL;
 354                }
 355                rx_buf = efx_rx_buf_next(rx_queue, rx_buf);
 356        } while (--num_bufs);
 357}
 358
 359void efx_rx_slow_fill(struct timer_list *t)
 360{
 361        struct efx_rx_queue *rx_queue = from_timer(rx_queue, t, slow_fill);
 362
 363        /* Post an event to cause NAPI to run and refill the queue */
 364        efx_nic_generate_fill_event(rx_queue);
 365        ++rx_queue->slow_fill_count;
 366}
 367
 368void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue)
 369{
 370        mod_timer(&rx_queue->slow_fill, jiffies + msecs_to_jiffies(10));
 371}
 372
  373/* efx_init_rx_buffers - create a batch of page-based RX buffers
 374 *
 375 * @rx_queue:           Efx RX queue
 376 *
  377 * This allocates a batch of pages, maps them for DMA, and populates a
  378 * struct efx_rx_buffer for each buffer within them. Returns a negative
  379 * error code or 0 on success. If a single page can be used for multiple
  380 * buffers, then the page will either be inserted fully, or not at all.
 381 */
 382static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic)
 383{
 384        unsigned int page_offset, index, count;
 385        struct efx_nic *efx = rx_queue->efx;
 386        struct efx_rx_page_state *state;
 387        struct efx_rx_buffer *rx_buf;
 388        dma_addr_t dma_addr;
 389        struct page *page;
 390
 391        count = 0;
 392        do {
 393                page = efx_reuse_page(rx_queue);
 394                if (page == NULL) {
 395                        page = alloc_pages(__GFP_COMP |
 396                                           (atomic ? GFP_ATOMIC : GFP_KERNEL),
 397                                           efx->rx_buffer_order);
 398                        if (unlikely(page == NULL))
 399                                return -ENOMEM;
 400                        dma_addr =
 401                                dma_map_page(&efx->pci_dev->dev, page, 0,
 402                                             PAGE_SIZE << efx->rx_buffer_order,
 403                                             DMA_FROM_DEVICE);
 404                        if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
 405                                                       dma_addr))) {
 406                                __free_pages(page, efx->rx_buffer_order);
 407                                return -EIO;
 408                        }
 409                        state = page_address(page);
 410                        state->dma_addr = dma_addr;
 411                } else {
 412                        state = page_address(page);
 413                        dma_addr = state->dma_addr;
 414                }
 415
 416                dma_addr += sizeof(struct efx_rx_page_state);
 417                page_offset = sizeof(struct efx_rx_page_state);
 418
 419                do {
 420                        index = rx_queue->added_count & rx_queue->ptr_mask;
 421                        rx_buf = efx_rx_buffer(rx_queue, index);
 422                        rx_buf->dma_addr = dma_addr + efx->rx_ip_align +
 423                                           EFX_XDP_HEADROOM;
 424                        rx_buf->page = page;
 425                        rx_buf->page_offset = page_offset + efx->rx_ip_align +
 426                                              EFX_XDP_HEADROOM;
 427                        rx_buf->len = efx->rx_dma_len;
 428                        rx_buf->flags = 0;
 429                        ++rx_queue->added_count;
 430                        get_page(page);
 431                        dma_addr += efx->rx_page_buf_step;
 432                        page_offset += efx->rx_page_buf_step;
 433                } while (page_offset + efx->rx_page_buf_step <= PAGE_SIZE);
 434
 435                rx_buf->flags = EFX_RX_BUF_LAST_IN_PAGE;
 436        } while (++count < efx->rx_pages_per_batch);
 437
 438        return 0;
 439}
 440
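/* Work out how each page is split into RX buffers.  As an illustrative
 * example on a 4KiB page with rx_buffer_order == 0: if rx_page_buf_step
 * rounds up to 1920 bytes, then rx_bufs_per_page =
 * (4096 - sizeof(struct efx_rx_page_state)) / 1920 = 2, rx_buffer_truesize
 * = 4096 / 2 = 2048, and rx_pages_per_batch =
 * DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH, 2).
 */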
 441void efx_rx_config_page_split(struct efx_nic *efx)
 442{
 443        efx->rx_page_buf_step = ALIGN(efx->rx_dma_len + efx->rx_ip_align +
 444                                      EFX_XDP_HEADROOM + EFX_XDP_TAILROOM,
 445                                      EFX_RX_BUF_ALIGNMENT);
 446        efx->rx_bufs_per_page = efx->rx_buffer_order ? 1 :
 447                ((PAGE_SIZE - sizeof(struct efx_rx_page_state)) /
 448                efx->rx_page_buf_step);
 449        efx->rx_buffer_truesize = (PAGE_SIZE << efx->rx_buffer_order) /
 450                efx->rx_bufs_per_page;
 451        efx->rx_pages_per_batch = DIV_ROUND_UP(EFX_RX_PREFERRED_BATCH,
 452                                               efx->rx_bufs_per_page);
 453}
 454
 455/* efx_fast_push_rx_descriptors - push new RX descriptors quickly
 456 * @rx_queue:           RX descriptor queue
 457 *
 458 * This will aim to fill the RX descriptor queue up to
  459 * @rx_queue->max_fill. If there is insufficient memory to do so,
  460 * a slow fill will be scheduled.
 461 *
  462 * The caller must provide serialisation (none is used here). In practice,
 463 * this means this function must run from the NAPI handler, or be called
 464 * when NAPI is disabled.
 465 */
 466void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue, bool atomic)
 467{
 468        struct efx_nic *efx = rx_queue->efx;
 469        unsigned int fill_level, batch_size;
 470        int space, rc = 0;
 471
 472        if (!rx_queue->refill_enabled)
 473                return;
 474
 475        /* Calculate current fill level, and exit if we don't need to fill */
 476        fill_level = (rx_queue->added_count - rx_queue->removed_count);
 477        EFX_WARN_ON_ONCE_PARANOID(fill_level > rx_queue->efx->rxq_entries);
 478        if (fill_level >= rx_queue->fast_fill_trigger)
 479                goto out;
 480
 481        /* Record minimum fill level */
 482        if (unlikely(fill_level < rx_queue->min_fill)) {
 483                if (fill_level)
 484                        rx_queue->min_fill = fill_level;
 485        }
 486
 487        batch_size = efx->rx_pages_per_batch * efx->rx_bufs_per_page;
 488        space = rx_queue->max_fill - fill_level;
 489        EFX_WARN_ON_ONCE_PARANOID(space < batch_size);
 490
 491        netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
 492                   "RX queue %d fast-filling descriptor ring from"
 493                   " level %d to level %d\n",
 494                   efx_rx_queue_index(rx_queue), fill_level,
 495                   rx_queue->max_fill);
 496
 497        do {
 498                rc = efx_init_rx_buffers(rx_queue, atomic);
 499                if (unlikely(rc)) {
 500                        /* Ensure that we don't leave the rx queue empty */
 501                        efx_schedule_slow_fill(rx_queue);
 502                        goto out;
 503                }
 504        } while ((space -= batch_size) >= batch_size);
 505
 506        netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
 507                   "RX queue %d fast-filled descriptor ring "
 508                   "to level %d\n", efx_rx_queue_index(rx_queue),
 509                   rx_queue->added_count - rx_queue->removed_count);
 510
 511 out:
 512        if (rx_queue->notified_count != rx_queue->added_count)
 513                efx_nic_notify_rx_desc(rx_queue);
 514}
 515
 516/* Pass a received packet up through GRO.  GRO can handle pages
 517 * regardless of checksum state and skbs with a good checksum.
 518 */
 519void
 520efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
 521                  unsigned int n_frags, u8 *eh, __wsum csum)
 522{
 523        struct napi_struct *napi = &channel->napi_str;
 524        struct efx_nic *efx = channel->efx;
 525        struct sk_buff *skb;
 526
 527        skb = napi_get_frags(napi);
 528        if (unlikely(!skb)) {
 529                struct efx_rx_queue *rx_queue;
 530
 531                rx_queue = efx_channel_get_rx_queue(channel);
 532                efx_free_rx_buffers(rx_queue, rx_buf, n_frags);
 533                return;
 534        }
 535
 536        if (efx->net_dev->features & NETIF_F_RXHASH &&
 537            efx_rx_buf_hash_valid(efx, eh))
 538                skb_set_hash(skb, efx_rx_buf_hash(efx, eh),
 539                             PKT_HASH_TYPE_L3);
 540        if (csum) {
 541                skb->csum = csum;
 542                skb->ip_summed = CHECKSUM_COMPLETE;
 543        } else {
 544                skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
 545                                  CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
 546        }
 547        skb->csum_level = !!(rx_buf->flags & EFX_RX_PKT_CSUM_LEVEL);
 548
 549        for (;;) {
 550                skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
 551                                   rx_buf->page, rx_buf->page_offset,
 552                                   rx_buf->len);
 553                rx_buf->page = NULL;
 554                skb->len += rx_buf->len;
 555                if (skb_shinfo(skb)->nr_frags == n_frags)
 556                        break;
 557
 558                rx_buf = efx_rx_buf_next(&channel->rx_queue, rx_buf);
 559        }
 560
 561        skb->data_len = skb->len;
 562        skb->truesize += n_frags * efx->rx_buffer_truesize;
 563
 564        skb_record_rx_queue(skb, channel->rx_queue.core_index);
 565
 566        napi_gro_frags(napi);
 567}
 568
 569/* RSS contexts.  We're using linked lists and crappy O(n) algorithms, because
 570 * (a) this is an infrequent control-plane operation and (b) n is small (max 64)
 571 */
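/* For example, if contexts with user_id 1, 2 and 4 already exist, the next
 * allocation fills the gap: the new entry gets user_id 3 and is inserted
 * between 2 and 4, keeping the list sorted by user_id.
 */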
 572struct efx_rss_context *efx_alloc_rss_context_entry(struct efx_nic *efx)
 573{
 574        struct list_head *head = &efx->rss_context.list;
 575        struct efx_rss_context *ctx, *new;
 576        u32 id = 1; /* Don't use zero, that refers to the master RSS context */
 577
 578        WARN_ON(!mutex_is_locked(&efx->rss_lock));
 579
 580        /* Search for first gap in the numbering */
 581        list_for_each_entry(ctx, head, list) {
 582                if (ctx->user_id != id)
 583                        break;
 584                id++;
 585                /* Check for wrap.  If this happens, we have nearly 2^32
 586                 * allocated RSS contexts, which seems unlikely.
 587                 */
 588                if (WARN_ON_ONCE(!id))
 589                        return NULL;
 590        }
 591
 592        /* Create the new entry */
 593        new = kmalloc(sizeof(*new), GFP_KERNEL);
 594        if (!new)
 595                return NULL;
 596        new->context_id = EFX_MCDI_RSS_CONTEXT_INVALID;
 597        new->rx_hash_udp_4tuple = false;
 598
 599        /* Insert the new entry into the gap */
 600        new->user_id = id;
 601        list_add_tail(&new->list, &ctx->list);
 602        return new;
 603}
 604
 605struct efx_rss_context *efx_find_rss_context_entry(struct efx_nic *efx, u32 id)
 606{
 607        struct list_head *head = &efx->rss_context.list;
 608        struct efx_rss_context *ctx;
 609
 610        WARN_ON(!mutex_is_locked(&efx->rss_lock));
 611
 612        list_for_each_entry(ctx, head, list)
 613                if (ctx->user_id == id)
 614                        return ctx;
 615        return NULL;
 616}
 617
 618void efx_free_rss_context_entry(struct efx_rss_context *ctx)
 619{
 620        list_del(&ctx->list);
 621        kfree(ctx);
 622}
 623
 624void efx_set_default_rx_indir_table(struct efx_nic *efx,
 625                                    struct efx_rss_context *ctx)
 626{
 627        size_t i;
 628
 629        for (i = 0; i < ARRAY_SIZE(ctx->rx_indir_table); i++)
 630                ctx->rx_indir_table[i] =
 631                        ethtool_rxfh_indir_default(i, efx->rss_spread);
 632}
 633
 634/**
 635 * efx_filter_is_mc_recipient - test whether spec is a multicast recipient
 636 * @spec: Specification to test
 637 *
 638 * Return: %true if the specification is a non-drop RX filter that
 639 * matches a local MAC address I/G bit value of 1 or matches a local
 640 * IPv4 or IPv6 address value in the respective multicast address
 641 * range.  Otherwise %false.
 642 */
 643bool efx_filter_is_mc_recipient(const struct efx_filter_spec *spec)
 644{
 645        if (!(spec->flags & EFX_FILTER_FLAG_RX) ||
 646            spec->dmaq_id == EFX_FILTER_RX_DMAQ_ID_DROP)
 647                return false;
 648
 649        if (spec->match_flags &
 650            (EFX_FILTER_MATCH_LOC_MAC | EFX_FILTER_MATCH_LOC_MAC_IG) &&
 651            is_multicast_ether_addr(spec->loc_mac))
 652                return true;
 653
 654        if ((spec->match_flags &
 655             (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) ==
 656            (EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_LOC_HOST)) {
 657                if (spec->ether_type == htons(ETH_P_IP) &&
 658                    ipv4_is_multicast(spec->loc_host[0]))
 659                        return true;
 660                if (spec->ether_type == htons(ETH_P_IPV6) &&
 661                    ((const u8 *)spec->loc_host)[0] == 0xff)
 662                        return true;
 663        }
 664
 665        return false;
 666}
 667
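/* Equality and hashing of filter specs deliberately cover only the match
 * fields, which start at outer_vid; the control fields before it (such as
 * priority and the destination queue) do not affect which packets a filter
 * matches, so they are compared separately or ignored.
 */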
 668bool efx_filter_spec_equal(const struct efx_filter_spec *left,
 669                           const struct efx_filter_spec *right)
 670{
 671        if ((left->match_flags ^ right->match_flags) |
 672            ((left->flags ^ right->flags) &
 673             (EFX_FILTER_FLAG_RX | EFX_FILTER_FLAG_TX)))
 674                return false;
 675
 676        return memcmp(&left->outer_vid, &right->outer_vid,
 677                      sizeof(struct efx_filter_spec) -
 678                      offsetof(struct efx_filter_spec, outer_vid)) == 0;
 679}
 680
 681u32 efx_filter_spec_hash(const struct efx_filter_spec *spec)
 682{
 683        BUILD_BUG_ON(offsetof(struct efx_filter_spec, outer_vid) & 3);
 684        return jhash2((const u32 *)&spec->outer_vid,
 685                      (sizeof(struct efx_filter_spec) -
 686                       offsetof(struct efx_filter_spec, outer_vid)) / 4,
 687                      0);
 688}
 689
 690#ifdef CONFIG_RFS_ACCEL
 691bool efx_rps_check_rule(struct efx_arfs_rule *rule, unsigned int filter_idx,
 692                        bool *force)
 693{
 694        if (rule->filter_id == EFX_ARFS_FILTER_ID_PENDING) {
 695                /* ARFS is currently updating this entry, leave it */
 696                return false;
 697        }
 698        if (rule->filter_id == EFX_ARFS_FILTER_ID_ERROR) {
 699                /* ARFS tried and failed to update this, so it's probably out
 700                 * of date.  Remove the filter and the ARFS rule entry.
 701                 */
 702                rule->filter_id = EFX_ARFS_FILTER_ID_REMOVING;
 703                *force = true;
 704                return true;
 705        } else if (WARN_ON(rule->filter_id != filter_idx)) { /* can't happen */
 706                /* ARFS has moved on, so old filter is not needed.  Since we did
 707                 * not mark the rule with EFX_ARFS_FILTER_ID_REMOVING, it will
 708                 * not be removed by efx_rps_hash_del() subsequently.
 709                 */
 710                *force = true;
 711                return true;
 712        }
 713        /* Remove it iff ARFS wants to. */
 714        return true;
 715}
 716
 717static
 718struct hlist_head *efx_rps_hash_bucket(struct efx_nic *efx,
 719                                       const struct efx_filter_spec *spec)
 720{
 721        u32 hash = efx_filter_spec_hash(spec);
 722
 723        lockdep_assert_held(&efx->rps_hash_lock);
 724        if (!efx->rps_hash_table)
 725                return NULL;
 726        return &efx->rps_hash_table[hash % EFX_ARFS_HASH_TABLE_SIZE];
 727}
 728
 729struct efx_arfs_rule *efx_rps_hash_find(struct efx_nic *efx,
 730                                        const struct efx_filter_spec *spec)
 731{
 732        struct efx_arfs_rule *rule;
 733        struct hlist_head *head;
 734        struct hlist_node *node;
 735
 736        head = efx_rps_hash_bucket(efx, spec);
 737        if (!head)
 738                return NULL;
 739        hlist_for_each(node, head) {
 740                rule = container_of(node, struct efx_arfs_rule, node);
 741                if (efx_filter_spec_equal(spec, &rule->spec))
 742                        return rule;
 743        }
 744        return NULL;
 745}
 746
 747struct efx_arfs_rule *efx_rps_hash_add(struct efx_nic *efx,
 748                                       const struct efx_filter_spec *spec,
 749                                       bool *new)
 750{
 751        struct efx_arfs_rule *rule;
 752        struct hlist_head *head;
 753        struct hlist_node *node;
 754
 755        head = efx_rps_hash_bucket(efx, spec);
 756        if (!head)
 757                return NULL;
 758        hlist_for_each(node, head) {
 759                rule = container_of(node, struct efx_arfs_rule, node);
 760                if (efx_filter_spec_equal(spec, &rule->spec)) {
 761                        *new = false;
 762                        return rule;
 763                }
 764        }
 765        rule = kmalloc(sizeof(*rule), GFP_ATOMIC);
 766        *new = true;
 767        if (rule) {
 768                memcpy(&rule->spec, spec, sizeof(rule->spec));
 769                hlist_add_head(&rule->node, head);
 770        }
 771        return rule;
 772}
 773
 774void efx_rps_hash_del(struct efx_nic *efx, const struct efx_filter_spec *spec)
 775{
 776        struct efx_arfs_rule *rule;
 777        struct hlist_head *head;
 778        struct hlist_node *node;
 779
 780        head = efx_rps_hash_bucket(efx, spec);
 781        if (WARN_ON(!head))
 782                return;
 783        hlist_for_each(node, head) {
 784                rule = container_of(node, struct efx_arfs_rule, node);
 785                if (efx_filter_spec_equal(spec, &rule->spec)) {
 786                        /* Someone already reused the entry.  We know that if
 787                         * this check doesn't fire (i.e. filter_id == REMOVING)
 788                         * then the REMOVING mark was put there by our caller,
  789                         * because the caller holds a lock on the filter table and
 790                         * only holders of that lock set REMOVING.
 791                         */
 792                        if (rule->filter_id != EFX_ARFS_FILTER_ID_REMOVING)
 793                                return;
 794                        hlist_del(node);
 795                        kfree(rule);
 796                        return;
 797                }
 798        }
 799        /* We didn't find it. */
 800        WARN_ON(1);
 801}
 802#endif
 803
 804int efx_probe_filters(struct efx_nic *efx)
 805{
 806        int rc;
 807
 808        mutex_lock(&efx->mac_lock);
 809        down_write(&efx->filter_sem);
 810        rc = efx->type->filter_table_probe(efx);
 811        if (rc)
 812                goto out_unlock;
 813
 814#ifdef CONFIG_RFS_ACCEL
 815        if (efx->type->offload_features & NETIF_F_NTUPLE) {
 816                struct efx_channel *channel;
 817                int i, success = 1;
 818
 819                efx_for_each_channel(channel, efx) {
 820                        channel->rps_flow_id =
 821                                kcalloc(efx->type->max_rx_ip_filters,
 822                                        sizeof(*channel->rps_flow_id),
 823                                        GFP_KERNEL);
 824                        if (!channel->rps_flow_id)
 825                                success = 0;
 826                        else
 827                                for (i = 0;
 828                                     i < efx->type->max_rx_ip_filters;
 829                                     ++i)
 830                                        channel->rps_flow_id[i] =
 831                                                RPS_FLOW_ID_INVALID;
 832                        channel->rfs_expire_index = 0;
 833                        channel->rfs_filter_count = 0;
 834                }
 835
 836                if (!success) {
 837                        efx_for_each_channel(channel, efx)
 838                                kfree(channel->rps_flow_id);
 839                        efx->type->filter_table_remove(efx);
 840                        rc = -ENOMEM;
 841                        goto out_unlock;
 842                }
 843        }
 844#endif
 845out_unlock:
 846        up_write(&efx->filter_sem);
 847        mutex_unlock(&efx->mac_lock);
 848        return rc;
 849}
 850
 851void efx_remove_filters(struct efx_nic *efx)
 852{
 853#ifdef CONFIG_RFS_ACCEL
 854        struct efx_channel *channel;
 855
 856        efx_for_each_channel(channel, efx) {
 857                cancel_delayed_work_sync(&channel->filter_work);
 858                kfree(channel->rps_flow_id);
 859                channel->rps_flow_id = NULL;
 860        }
 861#endif
 862        down_write(&efx->filter_sem);
 863        efx->type->filter_table_remove(efx);
 864        up_write(&efx->filter_sem);
 865}
 866
 867#ifdef CONFIG_RFS_ACCEL
 868
 869static void efx_filter_rfs_work(struct work_struct *data)
 870{
 871        struct efx_async_filter_insertion *req = container_of(data, struct efx_async_filter_insertion,
 872                                                              work);
 873        struct efx_nic *efx = netdev_priv(req->net_dev);
 874        struct efx_channel *channel = efx_get_channel(efx, req->rxq_index);
 875        int slot_idx = req - efx->rps_slot;
 876        struct efx_arfs_rule *rule;
 877        u16 arfs_id = 0;
 878        int rc;
 879
 880        rc = efx->type->filter_insert(efx, &req->spec, true);
 881        if (rc >= 0)
 882                /* Discard 'priority' part of EF10+ filter ID (mcdi_filters) */
 883                rc %= efx->type->max_rx_ip_filters;
 884        if (efx->rps_hash_table) {
 885                spin_lock_bh(&efx->rps_hash_lock);
 886                rule = efx_rps_hash_find(efx, &req->spec);
 887                /* The rule might have already gone, if someone else's request
 888                 * for the same spec was already worked and then expired before
 889                 * we got around to our work.  In that case we have nothing
 890                 * tying us to an arfs_id, meaning that as soon as the filter
 891                 * is considered for expiry it will be removed.
 892                 */
 893                if (rule) {
 894                        if (rc < 0)
 895                                rule->filter_id = EFX_ARFS_FILTER_ID_ERROR;
 896                        else
 897                                rule->filter_id = rc;
 898                        arfs_id = rule->arfs_id;
 899                }
 900                spin_unlock_bh(&efx->rps_hash_lock);
 901        }
 902        if (rc >= 0) {
 903                /* Remember this so we can check whether to expire the filter
 904                 * later.
 905                 */
 906                mutex_lock(&efx->rps_mutex);
 907                if (channel->rps_flow_id[rc] == RPS_FLOW_ID_INVALID)
 908                        channel->rfs_filter_count++;
 909                channel->rps_flow_id[rc] = req->flow_id;
 910                mutex_unlock(&efx->rps_mutex);
 911
 912                if (req->spec.ether_type == htons(ETH_P_IP))
 913                        netif_info(efx, rx_status, efx->net_dev,
 914                                   "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d id %u]\n",
 915                                   (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
 916                                   req->spec.rem_host, ntohs(req->spec.rem_port),
 917                                   req->spec.loc_host, ntohs(req->spec.loc_port),
 918                                   req->rxq_index, req->flow_id, rc, arfs_id);
 919                else
 920                        netif_info(efx, rx_status, efx->net_dev,
 921                                   "steering %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u filter %d id %u]\n",
 922                                   (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
 923                                   req->spec.rem_host, ntohs(req->spec.rem_port),
 924                                   req->spec.loc_host, ntohs(req->spec.loc_port),
 925                                   req->rxq_index, req->flow_id, rc, arfs_id);
 926                channel->n_rfs_succeeded++;
 927        } else {
 928                if (req->spec.ether_type == htons(ETH_P_IP))
 929                        netif_dbg(efx, rx_status, efx->net_dev,
 930                                  "failed to steer %s %pI4:%u:%pI4:%u to queue %u [flow %u rc %d id %u]\n",
 931                                  (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
 932                                  req->spec.rem_host, ntohs(req->spec.rem_port),
 933                                  req->spec.loc_host, ntohs(req->spec.loc_port),
 934                                  req->rxq_index, req->flow_id, rc, arfs_id);
 935                else
 936                        netif_dbg(efx, rx_status, efx->net_dev,
 937                                  "failed to steer %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u rc %d id %u]\n",
 938                                  (req->spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
 939                                  req->spec.rem_host, ntohs(req->spec.rem_port),
 940                                  req->spec.loc_host, ntohs(req->spec.loc_port),
 941                                  req->rxq_index, req->flow_id, rc, arfs_id);
 942                channel->n_rfs_failed++;
 943                /* We're overloading the NIC's filter tables, so let's do a
 944                 * chunk of extra expiry work.
 945                 */
 946                __efx_filter_rfs_expire(channel, min(channel->rfs_filter_count,
 947                                                     100u));
 948        }
 949
 950        /* Release references */
 951        clear_bit(slot_idx, &efx->rps_slot_map);
 952        dev_put(req->net_dev);
 953}
 954
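/* ndo_rx_flow_steer() handler.  This runs in atomic context, so the flow is
 * dissected and the request parked in one of the EFX_RPS_MAX_IN_FLIGHT
 * rps_slot entries here, while the actual filter insertion is deferred to
 * efx_filter_rfs_work().
 */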
 955int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
 956                   u16 rxq_index, u32 flow_id)
 957{
 958        struct efx_nic *efx = netdev_priv(net_dev);
 959        struct efx_async_filter_insertion *req;
 960        struct efx_arfs_rule *rule;
 961        struct flow_keys fk;
 962        int slot_idx;
 963        bool new;
 964        int rc;
 965
 966        /* find a free slot */
 967        for (slot_idx = 0; slot_idx < EFX_RPS_MAX_IN_FLIGHT; slot_idx++)
 968                if (!test_and_set_bit(slot_idx, &efx->rps_slot_map))
 969                        break;
 970        if (slot_idx >= EFX_RPS_MAX_IN_FLIGHT)
 971                return -EBUSY;
 972
 973        if (flow_id == RPS_FLOW_ID_INVALID) {
 974                rc = -EINVAL;
 975                goto out_clear;
 976        }
 977
 978        if (!skb_flow_dissect_flow_keys(skb, &fk, 0)) {
 979                rc = -EPROTONOSUPPORT;
 980                goto out_clear;
 981        }
 982
 983        if (fk.basic.n_proto != htons(ETH_P_IP) && fk.basic.n_proto != htons(ETH_P_IPV6)) {
 984                rc = -EPROTONOSUPPORT;
 985                goto out_clear;
 986        }
 987        if (fk.control.flags & FLOW_DIS_IS_FRAGMENT) {
 988                rc = -EPROTONOSUPPORT;
 989                goto out_clear;
 990        }
 991
 992        req = efx->rps_slot + slot_idx;
 993        efx_filter_init_rx(&req->spec, EFX_FILTER_PRI_HINT,
 994                           efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0,
 995                           rxq_index);
 996        req->spec.match_flags =
 997                EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_IP_PROTO |
 998                EFX_FILTER_MATCH_LOC_HOST | EFX_FILTER_MATCH_LOC_PORT |
 999                EFX_FILTER_MATCH_REM_HOST | EFX_FILTER_MATCH_REM_PORT;
1000        req->spec.ether_type = fk.basic.n_proto;
1001        req->spec.ip_proto = fk.basic.ip_proto;
1002
1003        if (fk.basic.n_proto == htons(ETH_P_IP)) {
1004                req->spec.rem_host[0] = fk.addrs.v4addrs.src;
1005                req->spec.loc_host[0] = fk.addrs.v4addrs.dst;
1006        } else {
1007                memcpy(req->spec.rem_host, &fk.addrs.v6addrs.src,
1008                       sizeof(struct in6_addr));
1009                memcpy(req->spec.loc_host, &fk.addrs.v6addrs.dst,
1010                       sizeof(struct in6_addr));
1011        }
1012
1013        req->spec.rem_port = fk.ports.src;
1014        req->spec.loc_port = fk.ports.dst;
1015
1016        if (efx->rps_hash_table) {
1017                /* Add it to ARFS hash table */
1018                spin_lock(&efx->rps_hash_lock);
1019                rule = efx_rps_hash_add(efx, &req->spec, &new);
1020                if (!rule) {
1021                        rc = -ENOMEM;
1022                        goto out_unlock;
1023                }
1024                if (new)
1025                        rule->arfs_id = efx->rps_next_id++ % RPS_NO_FILTER;
1026                rc = rule->arfs_id;
1027                /* Skip if existing or pending filter already does the right thing */
1028                if (!new && rule->rxq_index == rxq_index &&
1029                    rule->filter_id >= EFX_ARFS_FILTER_ID_PENDING)
1030                        goto out_unlock;
1031                rule->rxq_index = rxq_index;
1032                rule->filter_id = EFX_ARFS_FILTER_ID_PENDING;
1033                spin_unlock(&efx->rps_hash_lock);
1034        } else {
1035                /* Without an ARFS hash table, we just use arfs_id 0 for all
1036                 * filters.  This means if multiple flows hash to the same
1037                 * flow_id, all but the most recently touched will be eligible
1038                 * for expiry.
1039                 */
1040                rc = 0;
1041        }
1042
1043        /* Queue the request */
1044        dev_hold(req->net_dev = net_dev);
1045        INIT_WORK(&req->work, efx_filter_rfs_work);
1046        req->rxq_index = rxq_index;
1047        req->flow_id = flow_id;
1048        schedule_work(&req->work);
1049        return rc;
1050out_unlock:
1051        spin_unlock(&efx->rps_hash_lock);
1052out_clear:
1053        clear_bit(slot_idx, &efx->rps_slot_map);
1054        return rc;
1055}
1056
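/* Scan up to @quota entries of this channel's rps_flow_id table, giving the
 * NIC type's filter_rfs_expire_one() method a chance to expire stale
 * filters.  Returns false (and does no work) if rps_mutex could not be
 * taken without blocking.
 */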
1057bool __efx_filter_rfs_expire(struct efx_channel *channel, unsigned int quota)
1058{
1059        bool (*expire_one)(struct efx_nic *efx, u32 flow_id, unsigned int index);
1060        struct efx_nic *efx = channel->efx;
1061        unsigned int index, size, start;
1062        u32 flow_id;
1063
1064        if (!mutex_trylock(&efx->rps_mutex))
1065                return false;
1066        expire_one = efx->type->filter_rfs_expire_one;
1067        index = channel->rfs_expire_index;
1068        start = index;
1069        size = efx->type->max_rx_ip_filters;
1070        while (quota) {
1071                flow_id = channel->rps_flow_id[index];
1072
1073                if (flow_id != RPS_FLOW_ID_INVALID) {
1074                        quota--;
1075                        if (expire_one(efx, flow_id, index)) {
1076                                netif_info(efx, rx_status, efx->net_dev,
1077                                           "expired filter %d [channel %u flow %u]\n",
1078                                           index, channel->channel, flow_id);
1079                                channel->rps_flow_id[index] = RPS_FLOW_ID_INVALID;
1080                                channel->rfs_filter_count--;
1081                        }
1082                }
1083                if (++index == size)
1084                        index = 0;
1085                /* If we were called with a quota that exceeds the total number
1086                 * of filters in the table (which shouldn't happen, but could
1087                 * if two callers race), ensure that we don't loop forever -
1088                 * stop when we've examined every row of the table.
1089                 */
1090                if (index == start)
1091                        break;
1092        }
1093
1094        channel->rfs_expire_index = index;
1095        mutex_unlock(&efx->rps_mutex);
1096        return true;
1097}
1098
1099#endif /* CONFIG_RFS_ACCEL */
1100