linux/drivers/net/ethernet/intel/ice/ice_txrx.c
   1// SPDX-License-Identifier: GPL-2.0
   2/* Copyright (c) 2018, Intel Corporation. */
   3
   4/* The driver transmit and receive code */
   5
   6#include <linux/mm.h>
   7#include <linux/netdevice.h>
   8#include <linux/prefetch.h>
   9#include <linux/bpf_trace.h>
  10#include <net/dsfield.h>
  11#include <net/xdp.h>
  12#include "ice_txrx_lib.h"
  13#include "ice_lib.h"
  14#include "ice.h"
  15#include "ice_trace.h"
  16#include "ice_dcb_lib.h"
  17#include "ice_xsk.h"
  18#include "ice_eswitch.h"
  19
  20#define ICE_RX_HDR_SIZE         256
  21
  22#define FDIR_DESC_RXDID 0x40
  23#define ICE_FDIR_CLEAN_DELAY 10
  24
  25/**
  26 * ice_prgm_fdir_fltr - Program a Flow Director filter
  27 * @vsi: VSI to send dummy packet
  28 * @fdir_desc: flow director descriptor
  29 * @raw_packet: allocated buffer for flow director
  30 */
  31int
  32ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
  33                   u8 *raw_packet)
  34{
  35        struct ice_tx_buf *tx_buf, *first;
  36        struct ice_fltr_desc *f_desc;
  37        struct ice_tx_desc *tx_desc;
  38        struct ice_tx_ring *tx_ring;
  39        struct device *dev;
  40        dma_addr_t dma;
  41        u32 td_cmd;
  42        u16 i;
  43
  44        /* VSI and Tx ring */
  45        if (!vsi)
  46                return -ENOENT;
  47        tx_ring = vsi->tx_rings[0];
  48        if (!tx_ring || !tx_ring->desc)
  49                return -ENOENT;
  50        dev = tx_ring->dev;
  51
  52        /* we are using two descriptors to add/del a filter and we can wait */
  53        for (i = ICE_FDIR_CLEAN_DELAY; ICE_DESC_UNUSED(tx_ring) < 2; i--) {
  54                if (!i)
  55                        return -EAGAIN;
  56                msleep_interruptible(1);
  57        }
  58
  59        dma = dma_map_single(dev, raw_packet, ICE_FDIR_MAX_RAW_PKT_SIZE,
  60                             DMA_TO_DEVICE);
  61
  62        if (dma_mapping_error(dev, dma))
  63                return -EINVAL;
  64
  65        /* grab the next descriptor */
  66        i = tx_ring->next_to_use;
  67        first = &tx_ring->tx_buf[i];
  68        f_desc = ICE_TX_FDIRDESC(tx_ring, i);
  69        memcpy(f_desc, fdir_desc, sizeof(*f_desc));
  70
  71        i++;
  72        i = (i < tx_ring->count) ? i : 0;
  73        tx_desc = ICE_TX_DESC(tx_ring, i);
  74        tx_buf = &tx_ring->tx_buf[i];
  75
  76        i++;
  77        tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
  78
  79        memset(tx_buf, 0, sizeof(*tx_buf));
  80        dma_unmap_len_set(tx_buf, len, ICE_FDIR_MAX_RAW_PKT_SIZE);
  81        dma_unmap_addr_set(tx_buf, dma, dma);
  82
  83        tx_desc->buf_addr = cpu_to_le64(dma);
  84        td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY |
  85                 ICE_TX_DESC_CMD_RE;
  86
  87        tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT;
  88        tx_buf->raw_buf = raw_packet;
  89
  90        tx_desc->cmd_type_offset_bsz =
  91                ice_build_ctob(td_cmd, 0, ICE_FDIR_MAX_RAW_PKT_SIZE, 0);
  92
  93        /* Force memory write to complete before letting h/w know
  94         * there are new descriptors to fetch.
  95         */
  96        wmb();
  97
  98        /* mark the data descriptor to be watched */
  99        first->next_to_watch = tx_desc;
 100
 101        writel(tx_ring->next_to_use, tx_ring->tail);
 102
 103        return 0;
 104}
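
/* Hedged usage note for ice_prgm_fdir_fltr() above: the function consumes two
 * descriptors on tx_rings[0], the filter programming descriptor copied from
 * @fdir_desc plus a dummy data descriptor pointing at @raw_packet. The
 * raw_packet buffer stays attached to that tx_buf as raw_buf with
 * ICE_TX_FLAGS_DUMMY_PKT set, so Tx cleanup releases it with devm_kfree()
 * (see ice_unmap_and_free_tx_buf() below); callers are therefore expected to
 * pass a devm-allocated buffer of ICE_FDIR_MAX_RAW_PKT_SIZE bytes.
 */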
 105
 106/**
 107 * ice_unmap_and_free_tx_buf - Release a Tx buffer
 108 * @ring: the ring that owns the buffer
 109 * @tx_buf: the buffer to free
 110 */
 111static void
 112ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
 113{
 114        if (tx_buf->skb) {
 115                if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT)
 116                        devm_kfree(ring->dev, tx_buf->raw_buf);
 117                else if (ice_ring_is_xdp(ring))
 118                        page_frag_free(tx_buf->raw_buf);
 119                else
 120                        dev_kfree_skb_any(tx_buf->skb);
 121                if (dma_unmap_len(tx_buf, len))
 122                        dma_unmap_single(ring->dev,
 123                                         dma_unmap_addr(tx_buf, dma),
 124                                         dma_unmap_len(tx_buf, len),
 125                                         DMA_TO_DEVICE);
 126        } else if (dma_unmap_len(tx_buf, len)) {
 127                dma_unmap_page(ring->dev,
 128                               dma_unmap_addr(tx_buf, dma),
 129                               dma_unmap_len(tx_buf, len),
 130                               DMA_TO_DEVICE);
 131        }
 132
 133        tx_buf->next_to_watch = NULL;
 134        tx_buf->skb = NULL;
 135        dma_unmap_len_set(tx_buf, len, 0);
 136        /* tx_buf must be completely set up in the transmit path */
 137}
 138
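/**
 * txring_txq - find the netdev Tx queue backing an ice Tx ring
 * @ring: Tx ring to look up the netdev queue for
 */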
 139static struct netdev_queue *txring_txq(const struct ice_tx_ring *ring)
 140{
 141        return netdev_get_tx_queue(ring->netdev, ring->q_index);
 142}
 143
 144/**
 145 * ice_clean_tx_ring - Free all Tx ring buffers
 146 * @tx_ring: ring to be cleaned
 147 */
 148void ice_clean_tx_ring(struct ice_tx_ring *tx_ring)
 149{
 150        u32 size;
 151        u16 i;
 152
 153        if (ice_ring_is_xdp(tx_ring) && tx_ring->xsk_pool) {
 154                ice_xsk_clean_xdp_ring(tx_ring);
 155                goto tx_skip_free;
 156        }
 157
 158        /* ring already cleared, nothing to do */
 159        if (!tx_ring->tx_buf)
 160                return;
 161
 162        /* Free all the Tx ring sk_buffs */
 163        for (i = 0; i < tx_ring->count; i++)
 164                ice_unmap_and_free_tx_buf(tx_ring, &tx_ring->tx_buf[i]);
 165
 166tx_skip_free:
 167        memset(tx_ring->tx_buf, 0, sizeof(*tx_ring->tx_buf) * tx_ring->count);
 168
 169        size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
 170                     PAGE_SIZE);
 171        /* Zero out the descriptor ring */
 172        memset(tx_ring->desc, 0, size);
 173
 174        tx_ring->next_to_use = 0;
 175        tx_ring->next_to_clean = 0;
 176        tx_ring->next_dd = ICE_RING_QUARTER(tx_ring) - 1;
 177        tx_ring->next_rs = ICE_RING_QUARTER(tx_ring) - 1;
 178
 179        if (!tx_ring->netdev)
 180                return;
 181
 182        /* cleanup Tx queue statistics */
 183        netdev_tx_reset_queue(txring_txq(tx_ring));
 184}
 185
 186/**
 187 * ice_free_tx_ring - Free Tx resources per queue
 188 * @tx_ring: Tx descriptor ring for a specific queue
 189 *
 190 * Free all transmit software resources
 191 */
 192void ice_free_tx_ring(struct ice_tx_ring *tx_ring)
 193{
 194        u32 size;
 195
 196        ice_clean_tx_ring(tx_ring);
 197        devm_kfree(tx_ring->dev, tx_ring->tx_buf);
 198        tx_ring->tx_buf = NULL;
 199
 200        if (tx_ring->desc) {
 201                size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
 202                             PAGE_SIZE);
 203                dmam_free_coherent(tx_ring->dev, size,
 204                                   tx_ring->desc, tx_ring->dma);
 205                tx_ring->desc = NULL;
 206        }
 207}
 208
 209/**
 210 * ice_clean_tx_irq - Reclaim resources after transmit completes
 211 * @tx_ring: Tx ring to clean
 212 * @napi_budget: Used to determine if we are in netpoll
 213 *
 214 * Returns true if there's any budget left (i.e. the clean is finished)
 215 */
 216static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget)
 217{
 218        unsigned int total_bytes = 0, total_pkts = 0;
 219        unsigned int budget = ICE_DFLT_IRQ_WORK;
 220        struct ice_vsi *vsi = tx_ring->vsi;
 221        s16 i = tx_ring->next_to_clean;
 222        struct ice_tx_desc *tx_desc;
 223        struct ice_tx_buf *tx_buf;
 224
 225        /* get the bql data ready */
 226        netdev_txq_bql_complete_prefetchw(txring_txq(tx_ring));
 227
 228        tx_buf = &tx_ring->tx_buf[i];
 229        tx_desc = ICE_TX_DESC(tx_ring, i);
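        /* Bias the index negative (it now runs from -count to -1) so the
         * ring wrap below can be detected with a cheap "!i" test; the count
         * is added back before next_to_clean is written at the end.
         */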
 230        i -= tx_ring->count;
 231
 232        prefetch(&vsi->state);
 233
 234        do {
 235                struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;
 236
 237                /* if next_to_watch is not set then there is no work pending */
 238                if (!eop_desc)
 239                        break;
 240
 241                /* prefetch skb->users, which napi_consume_skb() below will write */
 242                prefetchw(&tx_buf->skb->users);
 243
 244                smp_rmb();      /* prevent any other reads prior to eop_desc */
 245
 246                ice_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf);
 247                /* if the descriptor isn't done, no work yet to do */
 248                if (!(eop_desc->cmd_type_offset_bsz &
 249                      cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
 250                        break;
 251
 252                /* clear next_to_watch to prevent false hangs */
 253                tx_buf->next_to_watch = NULL;
 254
 255                /* update the statistics for this packet */
 256                total_bytes += tx_buf->bytecount;
 257                total_pkts += tx_buf->gso_segs;
 258
 259                /* free the skb */
 260                napi_consume_skb(tx_buf->skb, napi_budget);
 261
 262                /* unmap skb header data */
 263                dma_unmap_single(tx_ring->dev,
 264                                 dma_unmap_addr(tx_buf, dma),
 265                                 dma_unmap_len(tx_buf, len),
 266                                 DMA_TO_DEVICE);
 267
 268                /* clear tx_buf data */
 269                tx_buf->skb = NULL;
 270                dma_unmap_len_set(tx_buf, len, 0);
 271
 272                /* unmap remaining buffers */
 273                while (tx_desc != eop_desc) {
 274                        ice_trace(clean_tx_irq_unmap, tx_ring, tx_desc, tx_buf);
 275                        tx_buf++;
 276                        tx_desc++;
 277                        i++;
 278                        if (unlikely(!i)) {
 279                                i -= tx_ring->count;
 280                                tx_buf = tx_ring->tx_buf;
 281                                tx_desc = ICE_TX_DESC(tx_ring, 0);
 282                        }
 283
 284                        /* unmap any remaining paged data */
 285                        if (dma_unmap_len(tx_buf, len)) {
 286                                dma_unmap_page(tx_ring->dev,
 287                                               dma_unmap_addr(tx_buf, dma),
 288                                               dma_unmap_len(tx_buf, len),
 289                                               DMA_TO_DEVICE);
 290                                dma_unmap_len_set(tx_buf, len, 0);
 291                        }
 292                }
 293                ice_trace(clean_tx_irq_unmap_eop, tx_ring, tx_desc, tx_buf);
 294
 295                /* move us one more past the eop_desc for start of next pkt */
 296                tx_buf++;
 297                tx_desc++;
 298                i++;
 299                if (unlikely(!i)) {
 300                        i -= tx_ring->count;
 301                        tx_buf = tx_ring->tx_buf;
 302                        tx_desc = ICE_TX_DESC(tx_ring, 0);
 303                }
 304
 305                prefetch(tx_desc);
 306
 307                /* update budget accounting */
 308                budget--;
 309        } while (likely(budget));
 310
 311        i += tx_ring->count;
 312        tx_ring->next_to_clean = i;
 313
 314        ice_update_tx_ring_stats(tx_ring, total_pkts, total_bytes);
 315        netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts, total_bytes);
 316
 317#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
 318        if (unlikely(total_pkts && netif_carrier_ok(tx_ring->netdev) &&
 319                     (ICE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
 320                /* Make sure that anybody stopping the queue after this
 321                 * sees the new next_to_clean.
 322                 */
 323                smp_mb();
 324                if (netif_tx_queue_stopped(txring_txq(tx_ring)) &&
 325                    !test_bit(ICE_VSI_DOWN, vsi->state)) {
 326                        netif_tx_wake_queue(txring_txq(tx_ring));
 327                        ++tx_ring->tx_stats.restart_q;
 328                }
 329        }
 330
 331        return !!budget;
 332}
 333
 334/**
 335 * ice_setup_tx_ring - Allocate the Tx descriptors
 336 * @tx_ring: the Tx ring to set up
 337 *
 338 * Return 0 on success, negative on error
 339 */
 340int ice_setup_tx_ring(struct ice_tx_ring *tx_ring)
 341{
 342        struct device *dev = tx_ring->dev;
 343        u32 size;
 344
 345        if (!dev)
 346                return -ENOMEM;
 347
 348        /* warn if we are about to overwrite the pointer */
 349        WARN_ON(tx_ring->tx_buf);
 350        tx_ring->tx_buf =
 351                devm_kcalloc(dev, sizeof(*tx_ring->tx_buf), tx_ring->count,
 352                             GFP_KERNEL);
 353        if (!tx_ring->tx_buf)
 354                return -ENOMEM;
 355
 356        /* round up to nearest page */
 357        size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
 358                     PAGE_SIZE);
 359        tx_ring->desc = dmam_alloc_coherent(dev, size, &tx_ring->dma,
 360                                            GFP_KERNEL);
 361        if (!tx_ring->desc) {
 362                dev_err(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
 363                        size);
 364                goto err;
 365        }
 366
 367        tx_ring->next_to_use = 0;
 368        tx_ring->next_to_clean = 0;
 369        tx_ring->tx_stats.prev_pkt = -1;
 370        return 0;
 371
 372err:
 373        devm_kfree(dev, tx_ring->tx_buf);
 374        tx_ring->tx_buf = NULL;
 375        return -ENOMEM;
 376}
 377
 378/**
 379 * ice_clean_rx_ring - Free Rx buffers
 380 * @rx_ring: ring to be cleaned
 381 */
 382void ice_clean_rx_ring(struct ice_rx_ring *rx_ring)
 383{
 384        struct device *dev = rx_ring->dev;
 385        u32 size;
 386        u16 i;
 387
 388        /* ring already cleared, nothing to do */
 389        if (!rx_ring->rx_buf)
 390                return;
 391
 392        if (rx_ring->skb) {
 393                dev_kfree_skb(rx_ring->skb);
 394                rx_ring->skb = NULL;
 395        }
 396
 397        if (rx_ring->xsk_pool) {
 398                ice_xsk_clean_rx_ring(rx_ring);
 399                goto rx_skip_free;
 400        }
 401
 402        /* Free all the Rx ring sk_buffs */
 403        for (i = 0; i < rx_ring->count; i++) {
 404                struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i];
 405
 406                if (!rx_buf->page)
 407                        continue;
 408
 409                /* Invalidate cache lines that may have been written to by
 410                 * device so that we avoid corrupting memory.
 411                 */
 412                dma_sync_single_range_for_cpu(dev, rx_buf->dma,
 413                                              rx_buf->page_offset,
 414                                              rx_ring->rx_buf_len,
 415                                              DMA_FROM_DEVICE);
 416
 417                /* free resources associated with mapping */
 418                dma_unmap_page_attrs(dev, rx_buf->dma, ice_rx_pg_size(rx_ring),
 419                                     DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);
 420                __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);
 421
 422                rx_buf->page = NULL;
 423                rx_buf->page_offset = 0;
 424        }
 425
 426rx_skip_free:
 427        if (rx_ring->xsk_pool)
 428                memset(rx_ring->xdp_buf, 0, array_size(rx_ring->count, sizeof(*rx_ring->xdp_buf)));
 429        else
 430                memset(rx_ring->rx_buf, 0, array_size(rx_ring->count, sizeof(*rx_ring->rx_buf)));
 431
 432        /* Zero out the descriptor ring */
 433        size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
 434                     PAGE_SIZE);
 435        memset(rx_ring->desc, 0, size);
 436
 437        rx_ring->next_to_alloc = 0;
 438        rx_ring->next_to_clean = 0;
 439        rx_ring->next_to_use = 0;
 440}
 441
 442/**
 443 * ice_free_rx_ring - Free Rx resources
 444 * @rx_ring: ring to clean the resources from
 445 *
 446 * Free all receive software resources
 447 */
 448void ice_free_rx_ring(struct ice_rx_ring *rx_ring)
 449{
 450        u32 size;
 451
 452        ice_clean_rx_ring(rx_ring);
 453        if (rx_ring->vsi->type == ICE_VSI_PF)
 454                if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
 455                        xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 456        rx_ring->xdp_prog = NULL;
 457        if (rx_ring->xsk_pool) {
 458                kfree(rx_ring->xdp_buf);
 459                rx_ring->xdp_buf = NULL;
 460        } else {
 461                kfree(rx_ring->rx_buf);
 462                rx_ring->rx_buf = NULL;
 463        }
 464
 465        if (rx_ring->desc) {
 466                size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
 467                             PAGE_SIZE);
 468                dmam_free_coherent(rx_ring->dev, size,
 469                                   rx_ring->desc, rx_ring->dma);
 470                rx_ring->desc = NULL;
 471        }
 472}
 473
 474/**
 475 * ice_setup_rx_ring - Allocate the Rx descriptors
 476 * @rx_ring: the Rx ring to set up
 477 *
 478 * Return 0 on success, negative on error
 479 */
 480int ice_setup_rx_ring(struct ice_rx_ring *rx_ring)
 481{
 482        struct device *dev = rx_ring->dev;
 483        u32 size;
 484
 485        if (!dev)
 486                return -ENOMEM;
 487
 488        /* warn if we are about to overwrite the pointer */
 489        WARN_ON(rx_ring->rx_buf);
 490        rx_ring->rx_buf =
 491                kcalloc(rx_ring->count, sizeof(*rx_ring->rx_buf), GFP_KERNEL);
 492        if (!rx_ring->rx_buf)
 493                return -ENOMEM;
 494
 495        /* round up to nearest page */
 496        size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
 497                     PAGE_SIZE);
 498        rx_ring->desc = dmam_alloc_coherent(dev, size, &rx_ring->dma,
 499                                            GFP_KERNEL);
 500        if (!rx_ring->desc) {
 501                dev_err(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
 502                        size);
 503                goto err;
 504        }
 505
 506        rx_ring->next_to_use = 0;
 507        rx_ring->next_to_clean = 0;
 508
 509        if (ice_is_xdp_ena_vsi(rx_ring->vsi))
 510                WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);
 511
 512        if (rx_ring->vsi->type == ICE_VSI_PF &&
 513            !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
 514                if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
 515                                     rx_ring->q_index, rx_ring->q_vector->napi.napi_id))
 516                        goto err;
 517        return 0;
 518
 519err:
 520        kfree(rx_ring->rx_buf);
 521        rx_ring->rx_buf = NULL;
 522        return -ENOMEM;
 523}
 524
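/**
 * ice_rx_frame_truesize - estimate the truesize of a received frame
 * @rx_ring: Rx ring the frame was received on
 * @size: packet length from the Rx descriptor (unused when PAGE_SIZE < 8192)
 *
 * On 4K-page systems the buffers are fixed half-page chunks; on larger pages
 * the truesize is derived from the headroom, the frame length, and the
 * shared info area.
 */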
 525static unsigned int
 526ice_rx_frame_truesize(struct ice_rx_ring *rx_ring, unsigned int __maybe_unused size)
 527{
 528        unsigned int truesize;
 529
 530#if (PAGE_SIZE < 8192)
 531        truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
 532#else
 533        truesize = rx_ring->rx_offset ?
 534                SKB_DATA_ALIGN(rx_ring->rx_offset + size) +
 535                SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
 536                SKB_DATA_ALIGN(size);
 537#endif
 538        return truesize;
 539}
 540
 541/**
 542 * ice_run_xdp - Executes an XDP program on initialized xdp_buff
 543 * @rx_ring: Rx ring
 544 * @xdp: xdp_buff used as input to the XDP program
 545 * @xdp_prog: XDP program to run
 546 * @xdp_ring: ring to be used for XDP_TX action
 547 *
 548 * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
 549 */
 550static int
 551ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
 552            struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring)
 553{
 554        int err;
 555        u32 act;
 556
 557        act = bpf_prog_run_xdp(xdp_prog, xdp);
 558        switch (act) {
 559        case XDP_PASS:
 560                return ICE_XDP_PASS;
 561        case XDP_TX:
 562                if (static_branch_unlikely(&ice_xdp_locking_key))
 563                        spin_lock(&xdp_ring->tx_lock);
 564                err = ice_xmit_xdp_ring(xdp->data, xdp->data_end - xdp->data, xdp_ring);
 565                if (static_branch_unlikely(&ice_xdp_locking_key))
 566                        spin_unlock(&xdp_ring->tx_lock);
 567                if (err == ICE_XDP_CONSUMED)
 568                        goto out_failure;
 569                return err;
 570        case XDP_REDIRECT:
 571                err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
 572                if (err)
 573                        goto out_failure;
 574                return ICE_XDP_REDIR;
 575        default:
 576                bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act);
 577                fallthrough;
 578        case XDP_ABORTED:
 579out_failure:
 580                trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
 581                fallthrough;
 582        case XDP_DROP:
 583                return ICE_XDP_CONSUMED;
 584        }
 585}
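
/* Note: the ICE_XDP_* return values above are used as a bitmask by
 * ice_clean_rx_irq(), which ORs them into xdp_xmit and flushes any pending
 * XDP_TX/XDP_REDIRECT work via ice_finalize_xdp_rx() once the budget loop
 * is done.
 */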
 586
 587/**
 588 * ice_xdp_xmit - submit packets to XDP ring for transmission
 589 * @dev: netdev
 590 * @n: number of XDP frames to be transmitted
 591 * @frames: XDP frames to be transmitted
 592 * @flags: transmit flags
 593 *
 594 * Returns the number of frames successfully sent. Failed frames
 595 * will be freed by the XDP core.
 596 * For error cases, a negative errno code is returned and no frames
 597 * are transmitted (the caller must handle freeing frames).
 598 */
 599int
 600ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 601             u32 flags)
 602{
 603        struct ice_netdev_priv *np = netdev_priv(dev);
 604        unsigned int queue_index = smp_processor_id();
 605        struct ice_vsi *vsi = np->vsi;
 606        struct ice_tx_ring *xdp_ring;
 607        int nxmit = 0, i;
 608
 609        if (test_bit(ICE_VSI_DOWN, vsi->state))
 610                return -ENETDOWN;
 611
 612        if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq)
 613                return -ENXIO;
 614
 615        if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 616                return -EINVAL;
 617
 618        if (static_branch_unlikely(&ice_xdp_locking_key)) {
 619                queue_index %= vsi->num_xdp_txq;
 620                xdp_ring = vsi->xdp_rings[queue_index];
 621                spin_lock(&xdp_ring->tx_lock);
 622        } else {
 623                xdp_ring = vsi->xdp_rings[queue_index];
 624        }
 625
 626        for (i = 0; i < n; i++) {
 627                struct xdp_frame *xdpf = frames[i];
 628                int err;
 629
 630                err = ice_xmit_xdp_ring(xdpf->data, xdpf->len, xdp_ring);
 631                if (err != ICE_XDP_TX)
 632                        break;
 633                nxmit++;
 634        }
 635
 636        if (unlikely(flags & XDP_XMIT_FLUSH))
 637                ice_xdp_ring_update_tail(xdp_ring);
 638
 639        if (static_branch_unlikely(&ice_xdp_locking_key))
 640                spin_unlock(&xdp_ring->tx_lock);
 641
 642        return nxmit;
 643}
 644
 645/**
 646 * ice_alloc_mapped_page - recycle or make a new page
 647 * @rx_ring: ring to use
 648 * @bi: rx_buf struct to modify
 649 *
 650 * Returns true if the page was successfully allocated or
 651 * reused.
 652 */
 653static bool
 654ice_alloc_mapped_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *bi)
 655{
 656        struct page *page = bi->page;
 657        dma_addr_t dma;
 658
 659        /* since we are recycling buffers we should seldom need to alloc */
 660        if (likely(page))
 661                return true;
 662
 663        /* alloc new page for storage */
 664        page = dev_alloc_pages(ice_rx_pg_order(rx_ring));
 665        if (unlikely(!page)) {
 666                rx_ring->rx_stats.alloc_page_failed++;
 667                return false;
 668        }
 669
 670        /* map page for use */
 671        dma = dma_map_page_attrs(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring),
 672                                 DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);
 673
 674        /* if mapping failed free memory back to system since
 675         * there isn't much point in holding memory we can't use
 676         */
 677        if (dma_mapping_error(rx_ring->dev, dma)) {
 678                __free_pages(page, ice_rx_pg_order(rx_ring));
 679                rx_ring->rx_stats.alloc_page_failed++;
 680                return false;
 681        }
 682
 683        bi->dma = dma;
 684        bi->page = page;
 685        bi->page_offset = rx_ring->rx_offset;
 686        page_ref_add(page, USHRT_MAX - 1);
 687        bi->pagecnt_bias = USHRT_MAX;
 688
 689        return true;
 690}
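
/* Page reference accounting note: ice_alloc_mapped_page() takes USHRT_MAX - 1
 * extra references up front and mirrors them in the local pagecnt_bias
 * counter. The hot path then only decrements the bias (see ice_get_rx_buf())
 * instead of touching the atomic page refcount, ice_can_reuse_rx_page()
 * compares page_count() against the bias to decide whether the driver still
 * owns the page, and __page_frag_cache_drain() settles the difference when
 * the page is finally released.
 */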
 691
 692/**
 693 * ice_alloc_rx_bufs - Replace used receive buffers
 694 * @rx_ring: ring to place buffers on
 695 * @cleaned_count: number of buffers to replace
 696 *
 697 * Returns false if all allocations were successful, true if any fail. Returning
 698 * true signals to the caller that we didn't replace cleaned_count buffers and
 699 * there is more work to do.
 700 *
 701 * First, try to clean "cleaned_count" Rx buffers. Then refill the cleaned Rx
 702 * buffers. Then bump tail at most one time. Grouping like this lets us avoid
 703 * multiple tail writes per call.
 704 */
 705bool ice_alloc_rx_bufs(struct ice_rx_ring *rx_ring, u16 cleaned_count)
 706{
 707        union ice_32b_rx_flex_desc *rx_desc;
 708        u16 ntu = rx_ring->next_to_use;
 709        struct ice_rx_buf *bi;
 710
 711        /* do nothing if no valid netdev defined */
 712        if ((!rx_ring->netdev && rx_ring->vsi->type != ICE_VSI_CTRL) ||
 713            !cleaned_count)
 714                return false;
 715
 716        /* get the Rx descriptor and buffer based on next_to_use */
 717        rx_desc = ICE_RX_DESC(rx_ring, ntu);
 718        bi = &rx_ring->rx_buf[ntu];
 719
 720        do {
 721                /* if we fail here, we have work remaining */
 722                if (!ice_alloc_mapped_page(rx_ring, bi))
 723                        break;
 724
 725                /* sync the buffer for use by the device */
 726                dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
 727                                                 bi->page_offset,
 728                                                 rx_ring->rx_buf_len,
 729                                                 DMA_FROM_DEVICE);
 730
 731                /* Refresh the desc even if buffer_addrs didn't change
 732                 * because each write-back erases this info.
 733                 */
 734                rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
 735
 736                rx_desc++;
 737                bi++;
 738                ntu++;
 739                if (unlikely(ntu == rx_ring->count)) {
 740                        rx_desc = ICE_RX_DESC(rx_ring, 0);
 741                        bi = rx_ring->rx_buf;
 742                        ntu = 0;
 743                }
 744
 745                /* clear the status bits for the next_to_use descriptor */
 746                rx_desc->wb.status_error0 = 0;
 747
 748                cleaned_count--;
 749        } while (cleaned_count);
 750
 751        if (rx_ring->next_to_use != ntu)
 752                ice_release_rx_desc(rx_ring, ntu);
 753
 754        return !!cleaned_count;
 755}
 756
 757/**
 758 * ice_rx_buf_adjust_pg_offset - Prepare Rx buffer for reuse
 759 * @rx_buf: Rx buffer to adjust
 760 * @size: Size of adjustment
 761 *
 762 * Update the offset within the page so that the Rx buffer will be ready to be
 763 * reused. For systems with PAGE_SIZE < 8192 this function will flip the page
 764 * offset so that the second half of the page assigned to the Rx buffer is
 765 * used; otherwise the offset is simply moved by "size" bytes.
 766 */
 767static void
 768ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size)
 769{
 770#if (PAGE_SIZE < 8192)
 771        /* flip page offset to other buffer */
 772        rx_buf->page_offset ^= size;
 773#else
 774        /* move offset up to the next cache line */
 775        rx_buf->page_offset += size;
 776#endif
 777}
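
/* Illustrative example, assuming the usual 4K page split into two 2K buffers
 * used elsewhere in this file: with a truesize of 2048 the XOR above simply
 * flips page_offset between the lower and upper half of the page, so the two
 * halves are handed to hardware alternately while the other half may still be
 * referenced by the stack.
 */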
 778
 779/**
 780 * ice_can_reuse_rx_page - Determine if page can be reused for another Rx
 781 * @rx_buf: buffer containing the page
 782 * @rx_buf_pgcnt: rx_buf page refcount pre xdp_do_redirect() call
 783 *
 784 * If page is reusable, we have a green light for calling ice_reuse_rx_page,
 785 * which will assign the current buffer to the buffer that next_to_alloc is
 786 * pointing to; otherwise, the DMA mapping needs to be destroyed and
 787 * page freed
 788 */
 789static bool
 790ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf, int rx_buf_pgcnt)
 791{
 792        unsigned int pagecnt_bias = rx_buf->pagecnt_bias;
 793        struct page *page = rx_buf->page;
 794
 795        /* avoid re-using remote and pfmemalloc pages */
 796        if (!dev_page_is_reusable(page))
 797                return false;
 798
 799#if (PAGE_SIZE < 8192)
 800        /* if we are only owner of page we can reuse it */
 801        if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
 802                return false;
 803#else
 804#define ICE_LAST_OFFSET \
 805        (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048)
 806        if (rx_buf->page_offset > ICE_LAST_OFFSET)
 807                return false;
 808#endif /* PAGE_SIZE < 8192) */
 809
 810        /* If we have drained the page fragment pool we need to update
 811         * the pagecnt_bias and page count so that we fully restock the
 812         * number of references the driver holds.
 813         */
 814        if (unlikely(pagecnt_bias == 1)) {
 815                page_ref_add(page, USHRT_MAX - 1);
 816                rx_buf->pagecnt_bias = USHRT_MAX;
 817        }
 818
 819        return true;
 820}
 821
 822/**
 823 * ice_add_rx_frag - Add contents of Rx buffer to sk_buff as a frag
 824 * @rx_ring: Rx descriptor ring to transact packets on
 825 * @rx_buf: buffer containing page to add
 826 * @skb: sk_buff to place the data into
 827 * @size: packet length from rx_desc
 828 *
 829 * This function will add the data contained in rx_buf->page to the skb.
 830 * It will just attach the page as a frag to the skb.
 831 * The function will then update the page offset.
 832 */
 833static void
 834ice_add_rx_frag(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
 835                struct sk_buff *skb, unsigned int size)
 836{
 837#if (PAGE_SIZE >= 8192)
 838        unsigned int truesize = SKB_DATA_ALIGN(size + rx_ring->rx_offset);
 839#else
 840        unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
 841#endif
 842
 843        if (!size)
 844                return;
 845        skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buf->page,
 846                        rx_buf->page_offset, size, truesize);
 847
 848        /* page is being used so we must update the page offset */
 849        ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
 850}
 851
 852/**
 853 * ice_reuse_rx_page - page flip buffer and store it back on the ring
 854 * @rx_ring: Rx descriptor ring to store buffers on
 855 * @old_buf: donor buffer to have page reused
 856 *
 857 * Synchronizes page for reuse by the adapter
 858 */
 859static void
 860ice_reuse_rx_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *old_buf)
 861{
 862        u16 nta = rx_ring->next_to_alloc;
 863        struct ice_rx_buf *new_buf;
 864
 865        new_buf = &rx_ring->rx_buf[nta];
 866
 867        /* update, and store next to alloc */
 868        nta++;
 869        rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
 870
 871        /* Transfer page from old buffer to new buffer.
 872         * Move each member individually to avoid possible store
 873         * forwarding stalls and unnecessary copy of skb.
 874         */
 875        new_buf->dma = old_buf->dma;
 876        new_buf->page = old_buf->page;
 877        new_buf->page_offset = old_buf->page_offset;
 878        new_buf->pagecnt_bias = old_buf->pagecnt_bias;
 879}
 880
 881/**
 882 * ice_get_rx_buf - Fetch Rx buffer and synchronize data for use
 883 * @rx_ring: Rx descriptor ring to transact packets on
 884 * @size: size of buffer to add to skb
 885 * @rx_buf_pgcnt: rx_buf page refcount
 886 *
 887 * This function will pull an Rx buffer from the ring and synchronize it
 888 * for use by the CPU.
 889 */
 890static struct ice_rx_buf *
 891ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
 892               int *rx_buf_pgcnt)
 893{
 894        struct ice_rx_buf *rx_buf;
 895
 896        rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
 897        *rx_buf_pgcnt =
 898#if (PAGE_SIZE < 8192)
 899                page_count(rx_buf->page);
 900#else
 901                0;
 902#endif
 903        prefetchw(rx_buf->page);
 904
 905        if (!size)
 906                return rx_buf;
 907        /* we are reusing so sync this buffer for CPU use */
 908        dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma,
 909                                      rx_buf->page_offset, size,
 910                                      DMA_FROM_DEVICE);
 911
 912        /* We have pulled a buffer for use, so decrement pagecnt_bias */
 913        rx_buf->pagecnt_bias--;
 914
 915        return rx_buf;
 916}
 917
 918/**
 919 * ice_build_skb - Build skb around an existing buffer
 920 * @rx_ring: Rx descriptor ring to transact packets on
 921 * @rx_buf: Rx buffer to pull data from
 922 * @xdp: xdp_buff pointing to the data
 923 *
 924 * This function builds an skb around an existing Rx buffer, taking care
 925 * to set up the skb correctly and avoid any memcpy overhead.
 926 */
 927static struct sk_buff *
 928ice_build_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
 929              struct xdp_buff *xdp)
 930{
 931        u8 metasize = xdp->data - xdp->data_meta;
 932#if (PAGE_SIZE < 8192)
 933        unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
 934#else
 935        unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
 936                                SKB_DATA_ALIGN(xdp->data_end -
 937                                               xdp->data_hard_start);
 938#endif
 939        struct sk_buff *skb;
 940
 941        /* Prefetch first cache line of first page. If xdp->data_meta
 942         * is unused, this points exactly to xdp->data; otherwise we
 943         * likely have a consumer accessing the first few bytes of meta
 944         * data, and then the actual data.
 945         */
 946        net_prefetch(xdp->data_meta);
 947        /* build an skb around the page buffer */
 948        skb = napi_build_skb(xdp->data_hard_start, truesize);
 949        if (unlikely(!skb))
 950                return NULL;
 951
 952        /* must record the Rx queue, otherwise OS features such as
 953         * symmetric queues won't work
 954         */
 955        skb_record_rx_queue(skb, rx_ring->q_index);
 956
 957        /* update pointers within the skb to store the data */
 958        skb_reserve(skb, xdp->data - xdp->data_hard_start);
 959        __skb_put(skb, xdp->data_end - xdp->data);
 960        if (metasize)
 961                skb_metadata_set(skb, metasize);
 962
 963        /* buffer is used by skb, update page_offset */
 964        ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
 965
 966        return skb;
 967}
 968
 969/**
 970 * ice_construct_skb - Allocate skb and populate it
 971 * @rx_ring: Rx descriptor ring to transact packets on
 972 * @rx_buf: Rx buffer to pull data from
 973 * @xdp: xdp_buff pointing to the data
 974 *
 975 * This function allocates an skb. It then populates it with the page
 976 * data from the current receive descriptor, taking care to set up the
 977 * skb correctly.
 978 */
 979static struct sk_buff *
 980ice_construct_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
 981                  struct xdp_buff *xdp)
 982{
 983        unsigned int metasize = xdp->data - xdp->data_meta;
 984        unsigned int size = xdp->data_end - xdp->data;
 985        unsigned int headlen;
 986        struct sk_buff *skb;
 987
 988        /* prefetch first cache line of first page */
 989        net_prefetch(xdp->data_meta);
 990
 991        /* allocate a skb to store the frags */
 992        skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
 993                               ICE_RX_HDR_SIZE + metasize,
 994                               GFP_ATOMIC | __GFP_NOWARN);
 995        if (unlikely(!skb))
 996                return NULL;
 997
 998        skb_record_rx_queue(skb, rx_ring->q_index);
 999        /* Determine available headroom for copy */
1000        headlen = size;
1001        if (headlen > ICE_RX_HDR_SIZE)
1002                headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE);
1003
1004        /* align pull length to size of long to optimize memcpy performance */
1005        memcpy(__skb_put(skb, headlen + metasize), xdp->data_meta,
1006               ALIGN(headlen + metasize, sizeof(long)));
1007
1008        if (metasize) {
1009                skb_metadata_set(skb, metasize);
1010                __skb_pull(skb, metasize);
1011        }
1012
1013        /* if we exhaust the linear part then add what is left as a frag */
1014        size -= headlen;
1015        if (size) {
1016#if (PAGE_SIZE >= 8192)
1017                unsigned int truesize = SKB_DATA_ALIGN(size);
1018#else
1019                unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
1020#endif
1021                skb_add_rx_frag(skb, 0, rx_buf->page,
1022                                rx_buf->page_offset + headlen, size, truesize);
1023                /* buffer is used by skb, update page_offset */
1024                ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
1025        } else {
1026                /* buffer is unused, reset bias back to rx_buf; data was copied
1027                 * onto skb's linear part so there's no need for adjusting
1028                 * page offset and we can reuse this buffer as-is
1029                 */
1030                rx_buf->pagecnt_bias++;
1031        }
1032
1033        return skb;
1034}
1035
1036/**
1037 * ice_put_rx_buf - Clean up used buffer and either recycle or free
1038 * @rx_ring: Rx descriptor ring to transact packets on
1039 * @rx_buf: Rx buffer to pull data from
1040 * @rx_buf_pgcnt: Rx buffer page count pre xdp_do_redirect()
1041 *
1042 * This function will update next_to_clean and then clean up the contents
1043 * of the rx_buf. It will either recycle the buffer or unmap it and free
1044 * the associated resources.
1045 */
1046static void
1047ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
1048               int rx_buf_pgcnt)
1049{
1050        u16 ntc = rx_ring->next_to_clean + 1;
1051
1052        /* fetch, update, and store next to clean */
1053        ntc = (ntc < rx_ring->count) ? ntc : 0;
1054        rx_ring->next_to_clean = ntc;
1055
1056        if (!rx_buf)
1057                return;
1058
1059        if (ice_can_reuse_rx_page(rx_buf, rx_buf_pgcnt)) {
1060                /* hand second half of page back to the ring */
1061                ice_reuse_rx_page(rx_ring, rx_buf);
1062        } else {
1063                /* we are not reusing the buffer so unmap it */
1064                dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma,
1065                                     ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE,
1066                                     ICE_RX_DMA_ATTR);
1067                __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);
1068        }
1069
1070        /* clear contents of buffer_info */
1071        rx_buf->page = NULL;
1072}
1073
1074/**
1075 * ice_is_non_eop - process handling of non-EOP buffers
1076 * @rx_ring: Rx ring being processed
1077 * @rx_desc: Rx descriptor for current buffer
1078 *
1079 * If the buffer is an EOP buffer, this function exits returning false,
1080 * otherwise return true indicating that this is in fact a non-EOP buffer.
1081 */
1082static bool
1083ice_is_non_eop(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc)
1084{
1085        /* if we are the last buffer then there is nothing else to do */
1086#define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S)
1087        if (likely(ice_test_staterr(rx_desc->wb.status_error0, ICE_RXD_EOF)))
1088                return false;
1089
1090        rx_ring->rx_stats.non_eop_descs++;
1091
1092        return true;
1093}
1094
1095/**
1096 * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
1097 * @rx_ring: Rx descriptor ring to transact packets on
1098 * @budget: Total limit on number of packets to process
1099 *
1100 * This function provides a "bounce buffer" approach to Rx interrupt
1101 * processing. The advantage to this is that on systems that have
1102 * expensive overhead for IOMMU access this provides a means of avoiding
1103 * it by maintaining the mapping of the page to the system.
1104 *
1105 * Returns amount of work completed
1106 */
1107int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
1108{
1109        unsigned int total_rx_bytes = 0, total_rx_pkts = 0, frame_sz = 0;
1110        u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
1111        unsigned int offset = rx_ring->rx_offset;
1112        struct ice_tx_ring *xdp_ring = NULL;
1113        unsigned int xdp_res, xdp_xmit = 0;
1114        struct sk_buff *skb = rx_ring->skb;
1115        struct bpf_prog *xdp_prog = NULL;
1116        struct xdp_buff xdp;
1117        bool failure;
1118
1119        /* Frame size depends on rx_ring setup when PAGE_SIZE=4K */
1120#if (PAGE_SIZE < 8192)
1121        frame_sz = ice_rx_frame_truesize(rx_ring, 0);
1122#endif
1123        xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq);
1124
1125        xdp_prog = READ_ONCE(rx_ring->xdp_prog);
1126        if (xdp_prog)
1127                xdp_ring = rx_ring->xdp_ring;
1128
1129        /* start the loop to process Rx packets bounded by 'budget' */
1130        while (likely(total_rx_pkts < (unsigned int)budget)) {
1131                union ice_32b_rx_flex_desc *rx_desc;
1132                struct ice_rx_buf *rx_buf;
1133                unsigned char *hard_start;
1134                unsigned int size;
1135                u16 stat_err_bits;
1136                int rx_buf_pgcnt;
1137                u16 vlan_tag = 0;
1138                u16 rx_ptype;
1139
1140                /* get the Rx desc from Rx ring based on 'next_to_clean' */
1141                rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean);
1142
1143                /* status_error_len will always be zero for unused descriptors
1144                 * because it's cleared in cleanup and overlaps with hdr_addr,
1145                 * which is always zero because packet split isn't used. If the
1146                 * hardware wrote the DD bit then it will be non-zero.
1147                 */
1148                stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
1149                if (!ice_test_staterr(rx_desc->wb.status_error0, stat_err_bits))
1150                        break;
1151
1152                /* This memory barrier is needed to keep us from reading
1153                 * any other fields out of the rx_desc until we know the
1154                 * DD bit is set.
1155                 */
1156                dma_rmb();
1157
1158                ice_trace(clean_rx_irq, rx_ring, rx_desc);
1159                if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) {
1160                        struct ice_vsi *ctrl_vsi = rx_ring->vsi;
1161
1162                        if (rx_desc->wb.rxdid == FDIR_DESC_RXDID &&
1163                            ctrl_vsi->vf)
1164                                ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc);
1165                        ice_put_rx_buf(rx_ring, NULL, 0);
1166                        cleaned_count++;
1167                        continue;
1168                }
1169
1170                size = le16_to_cpu(rx_desc->wb.pkt_len) &
1171                        ICE_RX_FLX_DESC_PKT_LEN_M;
1172
1173                /* retrieve a buffer from the ring */
1174                rx_buf = ice_get_rx_buf(rx_ring, size, &rx_buf_pgcnt);
1175
1176                if (!size) {
1177                        xdp.data = NULL;
1178                        xdp.data_end = NULL;
1179                        xdp.data_hard_start = NULL;
1180                        xdp.data_meta = NULL;
1181                        goto construct_skb;
1182                }
1183
1184                hard_start = page_address(rx_buf->page) + rx_buf->page_offset -
1185                             offset;
1186                xdp_prepare_buff(&xdp, hard_start, offset, size, true);
1187#if (PAGE_SIZE > 4096)
1188                /* At larger PAGE_SIZE, frame_sz depends on the frame length */
1189                xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size);
1190#endif
1191
1192                if (!xdp_prog)
1193                        goto construct_skb;
1194
1195                xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog, xdp_ring);
1196                if (!xdp_res)
1197                        goto construct_skb;
1198                if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
1199                        xdp_xmit |= xdp_res;
1200                        ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz);
1201                } else {
1202                        rx_buf->pagecnt_bias++;
1203                }
1204                total_rx_bytes += size;
1205                total_rx_pkts++;
1206
1207                cleaned_count++;
1208                ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt);
1209                continue;
1210construct_skb:
1211                if (skb) {
1212                        ice_add_rx_frag(rx_ring, rx_buf, skb, size);
1213                } else if (likely(xdp.data)) {
1214                        if (ice_ring_uses_build_skb(rx_ring))
1215                                skb = ice_build_skb(rx_ring, rx_buf, &xdp);
1216                        else
1217                                skb = ice_construct_skb(rx_ring, rx_buf, &xdp);
1218                }
1219                /* exit if we failed to retrieve a buffer */
1220                if (!skb) {
1221                        rx_ring->rx_stats.alloc_buf_failed++;
1222                        if (rx_buf)
1223                                rx_buf->pagecnt_bias++;
1224                        break;
1225                }
1226
1227                ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt);
1228                cleaned_count++;
1229
1230                /* skip further processing if this isn't the EOP descriptor */
1231                if (ice_is_non_eop(rx_ring, rx_desc))
1232                        continue;
1233
1234                stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
1235                if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
1236                                              stat_err_bits))) {
1237                        dev_kfree_skb_any(skb);
1238                        continue;
1239                }
1240
1241                vlan_tag = ice_get_vlan_tag_from_rx_desc(rx_desc);
1242
1243                /* pad the skb if needed, to make a valid ethernet frame */
1244                if (eth_skb_pad(skb)) {
1245                        skb = NULL;
1246                        continue;
1247                }
1248
1249                /* probably a little skewed due to removing CRC */
1250                total_rx_bytes += skb->len;
1251
1252                /* populate checksum, VLAN, and protocol */
1253                rx_ptype = le16_to_cpu(rx_desc->wb.ptype_flex_flags0) &
1254                        ICE_RX_FLEX_DESC_PTYPE_M;
1255
1256                ice_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
1257
1258                ice_trace(clean_rx_irq_indicate, rx_ring, rx_desc, skb);
1259                /* send completed skb up the stack */
1260                ice_receive_skb(rx_ring, skb, vlan_tag);
1261                skb = NULL;
1262
1263                /* update budget accounting */
1264                total_rx_pkts++;
1265        }
1266
1267        /* return up to cleaned_count buffers to hardware */
1268        failure = ice_alloc_rx_bufs(rx_ring, cleaned_count);
1269
1270        if (xdp_prog)
1271                ice_finalize_xdp_rx(xdp_ring, xdp_xmit);
1272        rx_ring->skb = skb;
1273
1274        ice_update_rx_ring_stats(rx_ring, total_rx_pkts, total_rx_bytes);
1275
1276        /* guarantee a trip back through this routine if there was a failure */
1277        return failure ? budget : (int)total_rx_pkts;
1278}
1279
1280static void __ice_update_sample(struct ice_q_vector *q_vector,
1281                                struct ice_ring_container *rc,
1282                                struct dim_sample *sample,
1283                                bool is_tx)
1284{
1285        u64 packets = 0, bytes = 0;
1286
1287        if (is_tx) {
1288                struct ice_tx_ring *tx_ring;
1289
1290                ice_for_each_tx_ring(tx_ring, *rc) {
1291                        packets += tx_ring->stats.pkts;
1292                        bytes += tx_ring->stats.bytes;
1293                }
1294        } else {
1295                struct ice_rx_ring *rx_ring;
1296
1297                ice_for_each_rx_ring(rx_ring, *rc) {
1298                        packets += rx_ring->stats.pkts;
1299                        bytes += rx_ring->stats.bytes;
1300                }
1301        }
1302
1303        dim_update_sample(q_vector->total_events, packets, bytes, sample);
1304        sample->comp_ctr = 0;
1305
1306        /* if dim settings get stale, like when not updated for 1
1307         * second or longer, force it to start again. This addresses the
1308         * frequent case of an idle queue being switched to by the
1309         * scheduler. The 1,000 here means 1,000 milliseconds.
1310         */
1311        if (ktime_ms_delta(sample->time, rc->dim.start_sample.time) >= 1000)
1312                rc->dim.state = DIM_START_MEASURE;
1313}
1314
1315/**
1316 * ice_net_dim - Update net DIM algorithm
1317 * @q_vector: the vector associated with the interrupt
1318 *
1319 * Create a DIM sample and notify net_dim() so that it can possibly decide
1320 * a new ITR value based on incoming packets, bytes, and interrupts.
1321 *
1322 * This function is a no-op if the ring is not configured to dynamic ITR.
1323 */
1324static void ice_net_dim(struct ice_q_vector *q_vector)
1325{
1326        struct ice_ring_container *tx = &q_vector->tx;
1327        struct ice_ring_container *rx = &q_vector->rx;
1328
1329        if (ITR_IS_DYNAMIC(tx)) {
1330                struct dim_sample dim_sample;
1331
1332                __ice_update_sample(q_vector, tx, &dim_sample, true);
1333                net_dim(&tx->dim, dim_sample);
1334        }
1335
1336        if (ITR_IS_DYNAMIC(rx)) {
1337                struct dim_sample dim_sample;
1338
1339                __ice_update_sample(q_vector, rx, &dim_sample, false);
1340                net_dim(&rx->dim, dim_sample);
1341        }
1342}
1343
1344/**
1345 * ice_buildreg_itr - build value for writing to the GLINT_DYN_CTL register
1346 * @itr_idx: interrupt throttling index
1347 * @itr: interrupt throttling value in usecs
1348 */
1349static u32 ice_buildreg_itr(u16 itr_idx, u16 itr)
1350{
1351        /* The ITR value is reported in microseconds, and the register value is
1352         * recorded in 2 microsecond units. For this reason we only need to
1353         * shift by the GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S to apply this
1354         * granularity as a shift instead of division. The mask makes sure the
1355         * ITR value is never odd so we don't accidentally write into the field
1356         * prior to the ITR field.
1357         */
1358        itr &= ICE_ITR_MASK;
1359
1360        return GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M |
1361                (itr_idx << GLINT_DYN_CTL_ITR_INDX_S) |
1362                (itr << (GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S));
1363}
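
/* Worked example, assuming the 2 usec register granularity described above:
 * a requested ITR of 51 usecs is first masked to 50 by ICE_ITR_MASK (keeping
 * the value even so it cannot spill into the neighboring field), and the
 * shift by GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S stores it as 25 units of
 * 2 usecs in the interval field, alongside the INTENA and CLEARPBA bits and
 * the selected ITR index.
 */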
1364
1365/**
1366 * ice_enable_interrupt - re-enable MSI-X interrupt
1367 * @q_vector: the vector associated with the interrupt to enable
1368 *
1369 * If the VSI is down, the interrupt will not be re-enabled. Also,
1370 * when enabling the interrupt always reset the wb_on_itr to false
1371 * and trigger a software interrupt to clean out internal state.
1372 */
1373static void ice_enable_interrupt(struct ice_q_vector *q_vector)
1374{
1375        struct ice_vsi *vsi = q_vector->vsi;
1376        bool wb_en = q_vector->wb_on_itr;
1377        u32 itr_val;
1378
1379        if (test_bit(ICE_DOWN, vsi->state))
1380                return;
1381
1382        /* trigger an ITR delayed software interrupt when exiting busy poll, to
1383         * make sure to catch any pending cleanups that might have been missed
1384         * due to interrupt state transition. If busy poll or poll isn't
1385         * enabled, then don't update ITR, and just enable the interrupt.
1386         */
1387        if (!wb_en) {
1388                itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0);
1389        } else {
1390                q_vector->wb_on_itr = false;
1391
1392                /* do two things here with a single write. Set up the third ITR
1393                 * index to be used for software interrupt moderation, and then
1394                 * trigger a software interrupt with a rate limit of 20K on
1395                 * software interrupts, this will help avoid high interrupt
1396                 * loads due to frequently polling and exiting polling.
1397                 */
1398                itr_val = ice_buildreg_itr(ICE_IDX_ITR2, ICE_ITR_20K);
1399                itr_val |= GLINT_DYN_CTL_SWINT_TRIG_M |
1400                           ICE_IDX_ITR2 << GLINT_DYN_CTL_SW_ITR_INDX_S |
1401                           GLINT_DYN_CTL_SW_ITR_INDX_ENA_M;
1402        }
1403        wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val);
1404}
1405
1406/**
1407 * ice_set_wb_on_itr - set WB_ON_ITR for this q_vector
1408 * @q_vector: q_vector to set WB_ON_ITR on
1409 *
1410 * We need to tell hardware to write-back completed descriptors even when
1411 * interrupts are disabled. Descriptors will be written back on cache line
1412 * boundaries without WB_ON_ITR enabled, but if we don't enable WB_ON_ITR
1413 * descriptors may not be written back if they don't fill a cache line until
1414 * the next interrupt.
1415 *
1416 * This sets the write-back frequency to whatever was set previously for the
1417 * ITR indices. Also, set the INTENA_MSK bit to make sure hardware knows we
1418 * aren't meddling with the INTENA_M bit.
1419 */
1420static void ice_set_wb_on_itr(struct ice_q_vector *q_vector)
1421{
1422        struct ice_vsi *vsi = q_vector->vsi;
1423
1424        /* already in wb_on_itr mode no need to change it */
1425        /* already in wb_on_itr mode, no need to change it */
1426                return;
1427
1428        /* use previously set ITR values for all of the ITR indices by
1429         * specifying ICE_ITR_NONE, which will vary in adaptive (AIM) mode and
1430         * be static in non-adaptive mode (user configured)
1431         */
1432        wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx),
1433             ((ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) &
1434              GLINT_DYN_CTL_ITR_INDX_M) | GLINT_DYN_CTL_INTENA_MSK_M |
1435             GLINT_DYN_CTL_WB_ON_ITR_M);
1436
1437        q_vector->wb_on_itr = true;
1438}
1439
1440/**
1441 * ice_napi_poll - NAPI polling Rx/Tx cleanup routine
1442 * @napi: napi struct with our devices info in it
1443 * @budget: amount of work driver is allowed to do this pass, in packets
1444 *
1445 * This function will clean all queues associated with a q_vector.
1446 *
1447 * Returns the amount of work done
1448 */
1449int ice_napi_poll(struct napi_struct *napi, int budget)
1450{
1451        struct ice_q_vector *q_vector =
1452                                container_of(napi, struct ice_q_vector, napi);
1453        struct ice_tx_ring *tx_ring;
1454        struct ice_rx_ring *rx_ring;
1455        bool clean_complete = true;
1456        int budget_per_ring;
1457        int work_done = 0;
1458
1459        /* Since the actual Tx work is minimal, we can give the Tx a larger
1460         * budget and be more aggressive about cleaning up the Tx descriptors.
1461         */
1462        ice_for_each_tx_ring(tx_ring, q_vector->tx) {
1463                bool wd;
1464
1465                if (tx_ring->xsk_pool)
1466                        wd = ice_xmit_zc(tx_ring, ICE_DESC_UNUSED(tx_ring), budget);
1467                else if (ice_ring_is_xdp(tx_ring))
1468                        wd = true;
1469                else
1470                        wd = ice_clean_tx_irq(tx_ring, budget);
1471
1472                if (!wd)
1473                        clean_complete = false;
1474        }
1475
1476        /* Handle case where we are called by netpoll with a budget of 0 */
1477        if (unlikely(budget <= 0))
1478                return budget;
1479
1480        /* normally we have 1 Rx ring per q_vector */
1481        if (unlikely(q_vector->num_ring_rx > 1))
1482                /* We attempt to distribute budget to each Rx queue fairly, but
1483                 * don't allow the budget to go below 1 because that would exit
1484                 * polling early.
1485                 */
1486                budget_per_ring = max_t(int, budget / q_vector->num_ring_rx, 1);
1487        else
1488                /* Max of 1 Rx ring in this q_vector so give it the budget */
1489                budget_per_ring = budget;
1490
1491        ice_for_each_rx_ring(rx_ring, q_vector->rx) {
1492                int cleaned;
1493
1494                /* A dedicated path for zero-copy allows making a single
1495                 * comparison in the irq context instead of many inside the
1496                 * ice_clean_rx_irq function and makes the codebase cleaner.
1497                 */
1498                cleaned = rx_ring->xsk_pool ?
1499                          ice_clean_rx_irq_zc(rx_ring, budget_per_ring) :
1500                          ice_clean_rx_irq(rx_ring, budget_per_ring);
1501                work_done += cleaned;
1502                /* if we clean as many as budgeted, we must not be done */
1503                if (cleaned >= budget_per_ring)
1504                        clean_complete = false;
1505        }
1506
1507        /* If work not completed, return budget and polling will return */
1508        if (!clean_complete) {
1509                /* Set write-back on ITR so descriptors in partially filled
1510                 * cache lines are still written back even while we're polling.
1511                 */
1512                ice_set_wb_on_itr(q_vector);
1513                return budget;
1514        }
1515
1516        /* Exit the polling mode, but don't re-enable interrupts if stack might
1517         * poll us due to busy-polling
1518         */
1519        if (napi_complete_done(napi, work_done)) {
1520                ice_net_dim(q_vector);
1521                ice_enable_interrupt(q_vector);
1522        } else {
1523                ice_set_wb_on_itr(q_vector);
1524        }
1525
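            /* returning the full budget would tell the NAPI core more work
             * remains, so clamp what we report now that cleaning is complete
             */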
1526        return min_t(int, work_done, budget - 1);
1527}
1528
1529/**
1530 * __ice_maybe_stop_tx - 2nd level check for Tx stop conditions
1531 * @tx_ring: the ring to be checked
1532 * @size: the number of Tx descriptors we want to ensure are available
1533 *
1534 * Returns -EBUSY if a stop is needed, else 0
1535 */
1536static int __ice_maybe_stop_tx(struct ice_tx_ring *tx_ring, unsigned int size)
1537{
1538        netif_tx_stop_queue(txring_txq(tx_ring));
1539        /* Memory barrier before checking head and tail */
1540        smp_mb();
1541
1542        /* Check again in a case another CPU has just made room available. */
1543        if (likely(ICE_DESC_UNUSED(tx_ring) < size))
1544                return -EBUSY;
1545
1546        /* A reprieve! - use start_queue because it doesn't call schedule */
1547        netif_tx_start_queue(txring_txq(tx_ring));
1548        ++tx_ring->tx_stats.restart_q;
1549        return 0;
1550}
1551
1552/**
1553 * ice_maybe_stop_tx - 1st level check for Tx stop conditions
1554 * @tx_ring: the ring to be checked
1555 * @size:    the number of Tx descriptors we want to ensure are available
1556 *
1557 * Returns 0 if stop is not needed
1558 */
1559static int ice_maybe_stop_tx(struct ice_tx_ring *tx_ring, unsigned int size)
1560{
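            /* fast path: enough descriptors are already free */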
1561        if (likely(ICE_DESC_UNUSED(tx_ring) >= size))
1562                return 0;
1563
1564        return __ice_maybe_stop_tx(tx_ring, size);
1565}
1566
1567/**
1568 * ice_tx_map - Build the Tx descriptor
1569 * @tx_ring: ring to send buffer on
1570 * @first: first buffer info buffer to use
1571 * @off: pointer to struct that holds offload parameters
1572 *
1573 * This function loops over the skb data pointed to by *first,
1574 * gets a DMA address for each memory region, and programs the
1575 * address and length into the transmit descriptors.
1576 */
1577static void
1578ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first,
1579           struct ice_tx_offload_params *off)
1580{
1581        u64 td_offset, td_tag, td_cmd;
1582        u16 i = tx_ring->next_to_use;
1583        unsigned int data_len, size;
1584        struct ice_tx_desc *tx_desc;
1585        struct ice_tx_buf *tx_buf;
1586        struct sk_buff *skb;
1587        skb_frag_t *frag;
1588        dma_addr_t dma;
1589        bool kick;
1590
1591        td_tag = off->td_l2tag1;
1592        td_cmd = off->td_cmd;
1593        td_offset = off->td_offset;
1594        skb = first->skb;
1595
1596        data_len = skb->data_len;
1597        size = skb_headlen(skb);
1598
1599        tx_desc = ICE_TX_DESC(tx_ring, i);
1600
1601        if (first->tx_flags & ICE_TX_FLAGS_HW_VLAN) {
1602                td_cmd |= (u64)ICE_TX_DESC_CMD_IL2TAG1;
1603                td_tag = (first->tx_flags & ICE_TX_FLAGS_VLAN_M) >>
1604                          ICE_TX_FLAGS_VLAN_S;
1605        }
1606
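            /* map the linear part of the skb first; fragments are mapped in
             * the loop below
             */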
1607        dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
1608
1609        tx_buf = first;
1610
1611        for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
1612                unsigned int max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
1613
1614                if (dma_mapping_error(tx_ring->dev, dma))
1615                        goto dma_error;
1616
1617                /* record length, and DMA address */
1618                dma_unmap_len_set(tx_buf, len, size);
1619                dma_unmap_addr_set(tx_buf, dma, dma);
1620
1621                /* align size to end of page */
1622                max_data += -dma & (ICE_MAX_READ_REQ_SIZE - 1);
1623                tx_desc->buf_addr = cpu_to_le64(dma);
1624
1625                /* account for data chunks larger than the hardware
1626                 * can handle
1627                 */
1628                while (unlikely(size > ICE_MAX_DATA_PER_TXD)) {
1629                        tx_desc->cmd_type_offset_bsz =
1630                                ice_build_ctob(td_cmd, td_offset, max_data,
1631                                               td_tag);
1632
1633                        tx_desc++;
1634                        i++;
1635
1636                        if (i == tx_ring->count) {
1637                                tx_desc = ICE_TX_DESC(tx_ring, 0);
1638                                i = 0;
1639                        }
1640
1641                        dma += max_data;
1642                        size -= max_data;
1643
1644                        max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
1645                        tx_desc->buf_addr = cpu_to_le64(dma);
1646                }
1647
1648                if (likely(!data_len))
1649                        break;
1650
1651                tx_desc->cmd_type_offset_bsz = ice_build_ctob(td_cmd, td_offset,
1652                                                              size, td_tag);
1653
1654                tx_desc++;
1655                i++;
1656
1657                if (i == tx_ring->count) {
1658                        tx_desc = ICE_TX_DESC(tx_ring, 0);
1659                        i = 0;
1660                }
1661
1662                size = skb_frag_size(frag);
1663                data_len -= size;
1664
1665                dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
1666                                       DMA_TO_DEVICE);
1667
1668                tx_buf = &tx_ring->tx_buf[i];
1669        }
1670
1671        /* record SW timestamp if HW timestamp is not available */
1672        skb_tx_timestamp(first->skb);
1673
1674        i++;
1675        if (i == tx_ring->count)
1676                i = 0;
1677
1678        /* write last descriptor with RS and EOP bits */
1679        td_cmd |= (u64)ICE_TXD_LAST_DESC_CMD;
1680        tx_desc->cmd_type_offset_bsz =
1681                        ice_build_ctob(td_cmd, td_offset, size, td_tag);
1682
1683        /* Force memory writes to complete before letting h/w know there
1684         * are new descriptors to fetch.
1685         *
1686         * We also use this memory barrier to make certain all of the
1687         * status bits have been updated before next_to_watch is written.
1688         */
1689        wmb();
1690
1691        /* set next_to_watch value indicating a packet is present */
1692        first->next_to_watch = tx_desc;
1693
1694        tx_ring->next_to_use = i;
1695
1696        ice_maybe_stop_tx(tx_ring, DESC_NEEDED);
1697
1698        /* account bytes with BQL; kick tells us whether to ring the doorbell */
1699        kick = __netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount,
1700                                      netdev_xmit_more());
1701        if (kick)
1702                /* notify HW of packet */
1703                writel(i, tx_ring->tail);
1704
1705        return;
1706
1707dma_error:
1708        /* clear DMA mappings for failed tx_buf map */
1709        for (;;) {
1710                tx_buf = &tx_ring->tx_buf[i];
1711                ice_unmap_and_free_tx_buf(tx_ring, tx_buf);
1712                if (tx_buf == first)
1713                        break;
1714                if (i == 0)
1715                        i = tx_ring->count;
1716                i--;
1717        }
1718
1719        tx_ring->next_to_use = i;
1720}
1721
1722/**
1723 * ice_tx_csum - Enable Tx checksum offloads
1724 * @first: pointer to struct ice_tx_buf
1725 * @off: pointer to struct that holds offload parameters
1726 *
1727 * Returns 0 or error (negative) if checksum offload can't happen, 1 otherwise.
1728 */
1729static
1730int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off)
1731{
1732        u32 l4_len = 0, l3_len = 0, l2_len = 0;
1733        struct sk_buff *skb = first->skb;
1734        union {
1735                struct iphdr *v4;
1736                struct ipv6hdr *v6;
1737                unsigned char *hdr;
1738        } ip;
1739        union {
1740                struct tcphdr *tcp;
1741                unsigned char *hdr;
1742        } l4;
1743        __be16 frag_off, protocol;
1744        unsigned char *exthdr;
1745        u32 offset, cmd = 0;
1746        u8 l4_proto = 0;
1747
1748        if (skb->ip_summed != CHECKSUM_PARTIAL)
1749                return 0;
1750
1751        ip.hdr = skb_network_header(skb);
1752        l4.hdr = skb_transport_header(skb);
1753
1754        /* compute outer L2 header size */
1755        l2_len = ip.hdr - skb->data;
1756        offset = (l2_len / 2) << ICE_TX_DESC_LEN_MACLEN_S;
1757
1758        protocol = vlan_get_protocol(skb);
1759
1760        if (protocol == htons(ETH_P_IP))
1761                first->tx_flags |= ICE_TX_FLAGS_IPV4;
1762        else if (protocol == htons(ETH_P_IPV6))
1763                first->tx_flags |= ICE_TX_FLAGS_IPV6;
1764
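            /* for encapsulated packets, program the outer header offloads
             * first, then switch the header pointers to the inner headers
             * handled below
             */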
1765        if (skb->encapsulation) {
1766                bool gso_ena = false;
1767                u32 tunnel = 0;
1768
1769                /* define outer network header type */
1770                if (first->tx_flags & ICE_TX_FLAGS_IPV4) {
1771                        tunnel |= (first->tx_flags & ICE_TX_FLAGS_TSO) ?
1772                                  ICE_TX_CTX_EIPT_IPV4 :
1773                                  ICE_TX_CTX_EIPT_IPV4_NO_CSUM;
1774                        l4_proto = ip.v4->protocol;
1775                } else if (first->tx_flags & ICE_TX_FLAGS_IPV6) {
1776                        int ret;
1777
1778                        tunnel |= ICE_TX_CTX_EIPT_IPV6;
1779                        exthdr = ip.hdr + sizeof(*ip.v6);
1780                        l4_proto = ip.v6->nexthdr;
1781                        ret = ipv6_skip_exthdr(skb, exthdr - skb->data,
1782                                               &l4_proto, &frag_off);
1783                        if (ret < 0)
1784                                return -1;
1785                }
1786
1787                /* define outer transport */
1788                switch (l4_proto) {
1789                case IPPROTO_UDP:
1790                        tunnel |= ICE_TXD_CTX_UDP_TUNNELING;
1791                        first->tx_flags |= ICE_TX_FLAGS_TUNNEL;
1792                        break;
1793                case IPPROTO_GRE:
1794                        tunnel |= ICE_TXD_CTX_GRE_TUNNELING;
1795                        first->tx_flags |= ICE_TX_FLAGS_TUNNEL;
1796                        break;
1797                case IPPROTO_IPIP:
1798                case IPPROTO_IPV6:
1799                        first->tx_flags |= ICE_TX_FLAGS_TUNNEL;
1800                        l4.hdr = skb_inner_network_header(skb);
1801                        break;
1802                default:
1803                        if (first->tx_flags & ICE_TX_FLAGS_TSO)
1804                                return -1;
1805
1806                        skb_checksum_help(skb);
1807                        return 0;
1808                }
1809
1810                /* compute outer L3 header size */
1811                tunnel |= ((l4.hdr - ip.hdr) / 4) <<
1812                          ICE_TXD_CTX_QW0_EIPLEN_S;
1813
1814                /* switch IP header pointer from outer to inner header */
1815                ip.hdr = skb_inner_network_header(skb);
1816
1817                /* compute tunnel header size */
1818                tunnel |= ((ip.hdr - l4.hdr) / 2) <<
1819                           ICE_TXD_CTX_QW0_NATLEN_S;
1820
1821                gso_ena = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL;
1822                /* indicate if we need to offload outer UDP header */
1823                if ((first->tx_flags & ICE_TX_FLAGS_TSO) && !gso_ena &&
1824                    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
1825                        tunnel |= ICE_TXD_CTX_QW0_L4T_CS_M;
1826
1827                /* record tunnel offload values */
1828                off->cd_tunnel_params |= tunnel;
1829
1830                /* set DTYP=1 to indicate that it's a Tx context descriptor
1831                 * in IPsec tunnel mode with Tx offloads in Quad word 1
1832                 */
1833                off->cd_qw1 |= (u64)ICE_TX_DESC_DTYPE_CTX;
1834
1835                /* switch L4 header pointer from outer to inner */
1836                l4.hdr = skb_inner_transport_header(skb);
1837                l4_proto = 0;
1838
1839                /* reset type as we transition from outer to inner headers */
1840                first->tx_flags &= ~(ICE_TX_FLAGS_IPV4 | ICE_TX_FLAGS_IPV6);
1841                if (ip.v4->version == 4)
1842                        first->tx_flags |= ICE_TX_FLAGS_IPV4;
1843                if (ip.v6->version == 6)
1844                        first->tx_flags |= ICE_TX_FLAGS_IPV6;
1845        }
1846
1847        /* Enable IP checksum offloads */
1848        if (first->tx_flags & ICE_TX_FLAGS_IPV4) {
1849                l4_proto = ip.v4->protocol;
1850                /* the stack computes the IP header already; the only time we
1851                 * need the hardware to recompute it is in the case of TSO.
1852                 */
1853                if (first->tx_flags & ICE_TX_FLAGS_TSO)
1854                        cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM;
1855                else
1856                        cmd |= ICE_TX_DESC_CMD_IIPT_IPV4;
1857
1858        } else if (first->tx_flags & ICE_TX_FLAGS_IPV6) {
1859                cmd |= ICE_TX_DESC_CMD_IIPT_IPV6;
1860                exthdr = ip.hdr + sizeof(*ip.v6);
1861                l4_proto = ip.v6->nexthdr;
1862                if (l4.hdr != exthdr)
1863                        ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_proto,
1864                                         &frag_off);
1865        } else {
1866                return -1;
1867        }
1868
1869        /* compute inner L3 header size */
1870        l3_len = l4.hdr - ip.hdr;
1871        offset |= (l3_len / 4) << ICE_TX_DESC_LEN_IPLEN_S;
1872
1873        /* Enable L4 checksum offloads */
1874        switch (l4_proto) {
1875        case IPPROTO_TCP:
1876                /* enable checksum offloads */
1877                cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP;
1878                l4_len = l4.tcp->doff;
1879                offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
1880                break;
1881        case IPPROTO_UDP:
1882                /* enable UDP checksum offload */
1883                cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP;
1884                l4_len = (sizeof(struct udphdr) >> 2);
1885                offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
1886                break;
1887        case IPPROTO_SCTP:
1888                /* enable SCTP checksum offload */
1889                cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP;
1890                l4_len = sizeof(struct sctphdr) >> 2;
1891                offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
1892                break;
1893
1894        default:
1895                if (first->tx_flags & ICE_TX_FLAGS_TSO)
1896                        return -1;
1897                skb_checksum_help(skb);
1898                return 0;
1899        }
1900
1901        off->td_cmd |= cmd;
1902        off->td_offset |= offset;
1903        return 1;
1904}
1905
1906/**
1907 * ice_tx_prepare_vlan_flags - prepare generic Tx VLAN tagging flags for HW
1908 * @tx_ring: ring to send buffer on
1909 * @first: pointer to struct ice_tx_buf
1910 *
1911 * Checks the skb and sets up the corresponding generic transmit flags
1912 * related to VLAN tagging for the HW, such as VLAN and DCB.
1913 */
1914static void
1915ice_tx_prepare_vlan_flags(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first)
1916{
1917        struct sk_buff *skb = first->skb;
1918
1919        /* nothing left to do, software offloaded VLAN */
1920        if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol))
1921                return;
1922
1923        /* the VLAN ethertype/TPID is determined by the VSI configuration and
1924         * netdev features; the driver allows either 802.1Q or 802.1ad VLAN
1925         * offloads, but not both, so we only care about the VLAN ID here
1926         */
1927        if (skb_vlan_tag_present(skb)) {
1928                first->tx_flags |= skb_vlan_tag_get(skb) << ICE_TX_FLAGS_VLAN_S;
1929                if (tx_ring->flags & ICE_TX_FLAGS_RING_VLAN_L2TAG2)
1930                        first->tx_flags |= ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN;
1931                else
1932                        first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
1933        }
1934
1935        ice_tx_prepare_vlan_flags_dcb(tx_ring, first);
1936}
1937
1938/**
1939 * ice_tso - computes mss and TSO length to prepare for TSO
1940 * @first: pointer to struct ice_tx_buf
1941 * @off: pointer to struct that holds offload parameters
1942 *
1943 * Returns 0 or error (negative) if TSO can't happen, 1 otherwise.
1944 */
1945static
1946int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off)
1947{
1948        struct sk_buff *skb = first->skb;
1949        union {
1950                struct iphdr *v4;
1951                struct ipv6hdr *v6;
1952                unsigned char *hdr;
1953        } ip;
1954        union {
1955                struct tcphdr *tcp;
1956                struct udphdr *udp;
1957                unsigned char *hdr;
1958        } l4;
1959        u64 cd_mss, cd_tso_len;
1960        u32 paylen;
1961        u8 l4_start;
1962        int err;
1963
1964        if (skb->ip_summed != CHECKSUM_PARTIAL)
1965                return 0;
1966
1967        if (!skb_is_gso(skb))
1968                return 0;
1969
1970        err = skb_cow_head(skb, 0);
1971        if (err < 0)
1972                return err;
1973
1974        /* cppcheck-suppress unreadVariable */
1975        ip.hdr = skb_network_header(skb);
1976        l4.hdr = skb_transport_header(skb);
1977
1978        /* initialize outer IP header fields */
1979        if (ip.v4->version == 4) {
1980                ip.v4->tot_len = 0;
1981                ip.v4->check = 0;
1982        } else {
1983                ip.v6->payload_len = 0;
1984        }
1985
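            /* for tunneled TSO, also zero the inner IP length fields and, when
             * needed, remove the payload length from the outer UDP checksum
             */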
1986        if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
1987                                         SKB_GSO_GRE_CSUM |
1988                                         SKB_GSO_IPXIP4 |
1989                                         SKB_GSO_IPXIP6 |
1990                                         SKB_GSO_UDP_TUNNEL |
1991                                         SKB_GSO_UDP_TUNNEL_CSUM)) {
1992                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
1993                    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
1994                        l4.udp->len = 0;
1995
1996                        /* determine offset of outer transport header */
1997                        l4_start = (u8)(l4.hdr - skb->data);
1998
1999                        /* remove payload length from outer checksum */
2000                        paylen = skb->len - l4_start;
2001                        csum_replace_by_diff(&l4.udp->check,
2002                                             (__force __wsum)htonl(paylen));
2003                }
2004
2005                /* reset pointers to inner headers */
2006
2007                /* cppcheck-suppress unreadVariable */
2008                ip.hdr = skb_inner_network_header(skb);
2009                l4.hdr = skb_inner_transport_header(skb);
2010
2011                /* initialize inner IP header fields */
2012                if (ip.v4->version == 4) {
2013                        ip.v4->tot_len = 0;
2014                        ip.v4->check = 0;
2015                } else {
2016                        ip.v6->payload_len = 0;
2017                }
2018        }
2019
2020        /* determine offset of transport header */
2021        l4_start = (u8)(l4.hdr - skb->data);
2022
2023        /* remove payload length from checksum */
2024        paylen = skb->len - l4_start;
2025
2026        if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
2027                csum_replace_by_diff(&l4.udp->check,
2028                                     (__force __wsum)htonl(paylen));
2029                /* compute length of UDP segmentation header */
2030                off->header_len = (u8)sizeof(struct udphdr) + l4_start;
2031        } else {
2032                csum_replace_by_diff(&l4.tcp->check,
2033                                     (__force __wsum)htonl(paylen));
2034                /* compute length of TCP segmentation header */
2035                off->header_len = (u8)((l4.tcp->doff * 4) + l4_start);
2036        }
2037
2038        /* update gso_segs and bytecount */
2039        first->gso_segs = skb_shinfo(skb)->gso_segs;
2040        first->bytecount += (first->gso_segs - 1) * off->header_len;
2041
2042        cd_tso_len = skb->len - off->header_len;
2043        cd_mss = skb_shinfo(skb)->gso_size;
2044
2045        /* record cdesc_qw1 with TSO parameters */
2046        off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
2047                             (ICE_TX_CTX_DESC_TSO << ICE_TXD_CTX_QW1_CMD_S) |
2048                             (cd_tso_len << ICE_TXD_CTX_QW1_TSO_LEN_S) |
2049                             (cd_mss << ICE_TXD_CTX_QW1_MSS_S));
2050        first->tx_flags |= ICE_TX_FLAGS_TSO;
2051        return 1;
2052}
2053
2054/**
2055 * ice_txd_use_count - estimate the number of descriptors needed for Tx
2056 * @size: transmit request size in bytes
2057 *
2058 * Due to hardware alignment restrictions (4K alignment), we need to
2059 * assume that we can have no more than 12K of data per descriptor, even
2060 * though each descriptor can take up to 16K - 1 bytes of aligned memory.
2061 * Thus, we need to divide by 12K. But division is slow! Instead,
2062 * we decompose the operation into shifts and one relatively cheap
2063 * multiply operation.
2064 *
2065 * To divide by 12K, we first divide by 4K, then divide by 3:
2066 *     To divide by 4K, shift right by 12 bits
2067 *     To divide by 3, multiply by 85, then divide by 256
2068 *     (Divide by 256 is done by shifting right by 8 bits)
2069 * Finally, we add one to round up. Because 256 isn't an exact multiple of
2070 * 3, we'll underestimate near each multiple of 12K. This is actually more
2071 * accurate as we have 4K - 1 of wiggle room that we can fit into the last
2072 * segment. For our purposes this is accurate out to 1M which is orders of
2073 * magnitude greater than our largest possible GSO size.
2074 *
2075 * This would then be implemented as:
2076 *     return (((size >> 12) * 85) >> 8) + ICE_DESCS_FOR_SKB_DATA_PTR;
2077 *
2078 * Since multiplication and division are commutative, we can reorder
2079 * operations into:
2080 *     return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
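     *
     * For example, size = 13000 gives (13000 * 85) >> 20 = 1; adding
     * ICE_DESCS_FOR_SKB_DATA_PTR to round up yields the 2 descriptors that
     * 13000 bytes of data actually need.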
2081 */
2082static unsigned int ice_txd_use_count(unsigned int size)
2083{
2084        return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
2085}
2086
2087/**
2088 * ice_xmit_desc_count - calculate number of Tx descriptors needed
2089 * @skb: send buffer
2090 *
2091 * Returns number of data descriptors needed for this skb.
2092 */
2093static unsigned int ice_xmit_desc_count(struct sk_buff *skb)
2094{
2095        const skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
2096        unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
2097        unsigned int count = 0, size = skb_headlen(skb);
2098
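            /* account for the linear head first, then one fragment per pass */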
2099        for (;;) {
2100                count += ice_txd_use_count(size);
2101
2102                if (!nr_frags--)
2103                        break;
2104
2105                size = skb_frag_size(frag++);
2106        }
2107
2108        return count;
2109}
2110
2111/**
2112 * __ice_chk_linearize - Check if there are more than 8 buffers per packet
2113 * @skb: send buffer
2114 *
2115 * Note: This HW can't DMA more than 8 buffers to build a packet on the wire
2116 * and so we need to figure out the cases where we need to linearize the skb.
2117 *
2118 * For TSO we need to count the TSO header and segment payload separately.
2119 * As such we need to check cases where we have 7 fragments or more as we
2120 * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
2121 * the segment payload in the first descriptor, and another 7 for the
2122 * fragments.
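     *
     * For example, a TSO skb with gso_size 9000 built from seven 1400-byte
     * fragments must be linearized, since no run of six fragments (8400
     * bytes) can cover a single 9000-byte segment.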
2123 */
2124static bool __ice_chk_linearize(struct sk_buff *skb)
2125{
2126        const skb_frag_t *frag, *stale;
2127        int nr_frags, sum;
2128
2129        /* no need to check if number of frags is less than 7 */
2130        nr_frags = skb_shinfo(skb)->nr_frags;
2131        if (nr_frags < (ICE_MAX_BUF_TXD - 1))
2132                return false;
2133
2134        /* We need to walk through the list and validate that each group
2135         * of 6 fragments totals at least gso_size.
2136         */
2137        nr_frags -= ICE_MAX_BUF_TXD - 2;
2138        frag = &skb_shinfo(skb)->frags[0];
2139
2140        /* Initialize sum to the negative value of gso_size minus 1. We
2141         * use this as the worst case scenario in which the frag ahead
2142         * of us only provides one byte, which is why we are limited to 6
2143         * descriptors for a single transmit as the header and previous
2144         * fragment are already consuming 2 descriptors.
2145         */
2146        sum = 1 - skb_shinfo(skb)->gso_size;
2147
2148        /* Add size of frags 0 through 4 to create our initial sum */
2149        sum += skb_frag_size(frag++);
2150        sum += skb_frag_size(frag++);
2151        sum += skb_frag_size(frag++);
2152        sum += skb_frag_size(frag++);
2153        sum += skb_frag_size(frag++);
2154
2155        /* Walk through fragments adding latest fragment, testing it, and
2156         * then removing stale fragments from the sum.
2157         */
2158        for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
2159                int stale_size = skb_frag_size(stale);
2160
2161                sum += skb_frag_size(frag++);
2162
2163                /* The stale fragment may present us with a smaller
2164                 * descriptor than the actual fragment size. To account
2165                 * for that we need to remove all the data on the front and
2166                 * figure out what the remainder would be in the last
2167                 * descriptor associated with the fragment.
2168                 */
2169                if (stale_size > ICE_MAX_DATA_PER_TXD) {
2170                        int align_pad = -(skb_frag_off(stale)) &
2171                                        (ICE_MAX_READ_REQ_SIZE - 1);
2172
2173                        sum -= align_pad;
2174                        stale_size -= align_pad;
2175
2176                        do {
2177                                sum -= ICE_MAX_DATA_PER_TXD_ALIGNED;
2178                                stale_size -= ICE_MAX_DATA_PER_TXD_ALIGNED;
2179                        } while (stale_size > ICE_MAX_DATA_PER_TXD);
2180                }
2181
2182                /* if sum is negative we failed to make sufficient progress */
2183                if (sum < 0)
2184                        return true;
2185
2186                if (!nr_frags--)
2187                        break;
2188
2189                sum -= stale_size;
2190        }
2191
2192        return false;
2193}
2194
2195/**
2196 * ice_chk_linearize - Check if there are more than 8 fragments per packet
2197 * @skb:      send buffer
2198 * @count:    number of buffers used
2199 *
2200 * Note: Our HW can't scatter-gather more than 8 fragments to build
2201 * a packet on the wire and so we need to figure out the cases where we
2202 * need to linearize the skb.
2203 */
2204static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count)
2205{
2206        /* Both TSO and single send will work if count is less than 8 */
2207        if (likely(count < ICE_MAX_BUF_TXD))
2208                return false;
2209
2210        if (skb_is_gso(skb))
2211                return __ice_chk_linearize(skb);
2212
2213        /* we can support up to 8 data buffers for a single send */
2214        return count != ICE_MAX_BUF_TXD;
2215}
2216
2217/**
2218 * ice_tstamp - set up context descriptor for hardware timestamp
2219 * @tx_ring: pointer to the Tx ring to send buffer on
2220 * @skb: pointer to the SKB we're sending
2221 * @first: Tx buffer
2222 * @off: Tx offload parameters
2223 */
2224static void
2225ice_tstamp(struct ice_tx_ring *tx_ring, struct sk_buff *skb,
2226           struct ice_tx_buf *first, struct ice_tx_offload_params *off)
2227{
2228        s8 idx;
2229
2230        /* only timestamp the outbound packet if the user has requested it */
2231        if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
2232                return;
2233
2234        if (!tx_ring->ptp_tx)
2235                return;
2236
2237        /* Tx timestamps cannot be sampled when doing TSO */
2238        if (first->tx_flags & ICE_TX_FLAGS_TSO)
2239                return;
2240
2241        /* Grab an open timestamp slot */
2242        idx = ice_ptp_request_ts(tx_ring->tx_tstamps, skb);
2243        if (idx < 0)
2244                return;
2245
2246        off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
2247                             (ICE_TX_CTX_DESC_TSYN << ICE_TXD_CTX_QW1_CMD_S) |
2248                             ((u64)idx << ICE_TXD_CTX_QW1_TSO_LEN_S));
2249        first->tx_flags |= ICE_TX_FLAGS_TSYN;
2250}
2251
2252/**
2253 * ice_xmit_frame_ring - Sends buffer on Tx ring
2254 * @skb: send buffer
2255 * @tx_ring: ring to send buffer on
2256 *
2257 * Returns NETDEV_TX_OK if sent, else an error code
2258 */
2259static netdev_tx_t
2260ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
2261{
2262        struct ice_tx_offload_params offload = { 0 };
2263        struct ice_vsi *vsi = tx_ring->vsi;
2264        struct ice_tx_buf *first;
2265        struct ethhdr *eth;
2266        unsigned int count;
2267        int tso, csum;
2268
2269        ice_trace(xmit_frame_ring, tx_ring, skb);
2270
2271        count = ice_xmit_desc_count(skb);
2272        if (ice_chk_linearize(skb, count)) {
2273                if (__skb_linearize(skb))
2274                        goto out_drop;
2275                count = ice_txd_use_count(skb->len);
2276                tx_ring->tx_stats.tx_linearize++;
2277        }
2278
2279        /* need: count descriptors computed above (roughly 1 per
2280         *       ICE_MAX_DATA_PER_TXD of frag data plus one for the skb head),
2281         *       + 4 desc gap to avoid the cache line where head is,
2282         *       + 1 desc for the context descriptor,
2283         * otherwise try again next time
2284         */
2285        if (ice_maybe_stop_tx(tx_ring, count + ICE_DESCS_PER_CACHE_LINE +
2286                              ICE_DESCS_FOR_CTX_DESC)) {
2287                tx_ring->tx_stats.tx_busy++;
2288                return NETDEV_TX_BUSY;
2289        }
2290
2291        /* prefetch for bql data which is infrequently used */
2292        netdev_txq_bql_enqueue_prefetchw(txring_txq(tx_ring));
2293
2294        offload.tx_ring = tx_ring;
2295
2296        /* record the location of the first descriptor for this packet */
2297        first = &tx_ring->tx_buf[tx_ring->next_to_use];
2298        first->skb = skb;
2299        first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
2300        first->gso_segs = 1;
2301        first->tx_flags = 0;
2302
2303        /* prepare the VLAN tagging flags for Tx */
2304        ice_tx_prepare_vlan_flags(tx_ring, first);
2305        if (first->tx_flags & ICE_TX_FLAGS_HW_OUTER_SINGLE_VLAN) {
2306                offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
2307                                        (ICE_TX_CTX_DESC_IL2TAG2 <<
2308                                        ICE_TXD_CTX_QW1_CMD_S));
2309                offload.cd_l2tag2 = (first->tx_flags & ICE_TX_FLAGS_VLAN_M) >>
2310                        ICE_TX_FLAGS_VLAN_S;
2311        }
2312
2313        /* set up TSO offload */
2314        tso = ice_tso(first, &offload);
2315        if (tso < 0)
2316                goto out_drop;
2317
2318        /* always set up Tx checksum offload */
2319        csum = ice_tx_csum(first, &offload);
2320        if (csum < 0)
2321                goto out_drop;
2322
2323        /* allow CONTROL frames to egress from the main VSI if FW LLDP is disabled */
2324        eth = (struct ethhdr *)skb_mac_header(skb);
2325        if (unlikely((skb->priority == TC_PRIO_CONTROL ||
2326                      eth->h_proto == htons(ETH_P_LLDP)) &&
2327                     vsi->type == ICE_VSI_PF &&
2328                     vsi->port_info->qos_cfg.is_sw_lldp))
2329                offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
2330                                        ICE_TX_CTX_DESC_SWTCH_UPLINK <<
2331                                        ICE_TXD_CTX_QW1_CMD_S);
2332
2333        ice_tstamp(tx_ring, skb, first, &offload);
2334        if (ice_is_switchdev_running(vsi->back))
2335                ice_eswitch_set_target_vsi(skb, &offload);
2336
2337        if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) {
2338                struct ice_tx_ctx_desc *cdesc;
2339                u16 i = tx_ring->next_to_use;
2340
2341                /* grab the next descriptor */
2342                cdesc = ICE_TX_CTX_DESC(tx_ring, i);
2343                i++;
2344                tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2345
2346                /* setup context descriptor */
2347                cdesc->tunneling_params = cpu_to_le32(offload.cd_tunnel_params);
2348                cdesc->l2tag2 = cpu_to_le16(offload.cd_l2tag2);
2349                cdesc->rsvd = cpu_to_le16(0);
2350                cdesc->qw1 = cpu_to_le64(offload.cd_qw1);
2351        }
2352
2353        ice_tx_map(tx_ring, first, &offload);
2354        return NETDEV_TX_OK;
2355
2356out_drop:
2357        ice_trace(xmit_frame_ring_drop, tx_ring, skb);
2358        dev_kfree_skb_any(skb);
2359        return NETDEV_TX_OK;
2360}
2361
2362/**
2363 * ice_start_xmit - Selects the correct VSI and Tx queue to send buffer
2364 * @skb: send buffer
2365 * @netdev: network interface device structure
2366 *
2367 * Returns NETDEV_TX_OK if sent, else an error code
2368 */
2369netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev)
2370{
2371        struct ice_netdev_priv *np = netdev_priv(netdev);
2372        struct ice_vsi *vsi = np->vsi;
2373        struct ice_tx_ring *tx_ring;
2374
2375        tx_ring = vsi->tx_rings[skb->queue_mapping];
2376
2377        /* hardware can't handle really short frames, hardware padding works
2378         * beyond this point
2379         */
2380        if (skb_put_padto(skb, ICE_MIN_TX_LEN))
2381                return NETDEV_TX_OK;
2382
2383        return ice_xmit_frame_ring(skb, tx_ring);
2384}
2385
2386/**
2387 * ice_get_dscp_up - return the UP/TC value for a SKB
2388 * @dcbcfg: DCB config that contains DSCP to UP/TC mapping
2389 * @skb: SKB to query for info to determine UP/TC
2390 *
2391 * This function should only be called when the PF is in L3 DSCP PFC mode
2392 */
2393static u8 ice_get_dscp_up(struct ice_dcbx_cfg *dcbcfg, struct sk_buff *skb)
2394{
2395        u8 dscp = 0;
2396
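            /* the DS field carries DSCP in its upper six bits; shift out the
             * two ECN bits
             */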
2397        if (skb->protocol == htons(ETH_P_IP))
2398                dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
2399        else if (skb->protocol == htons(ETH_P_IPV6))
2400                dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
2401
2402        return dcbcfg->dscp_map[dscp];
2403}
2404
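    /**
     * ice_select_queue - select the Tx queue for an outgoing skb
     * @netdev: network interface device structure
     * @skb: send buffer
     * @sb_dev: subordinate device, if any, that the skb is destined for
     *
     * When the PF is in L3 DSCP PFC mode, derive skb->priority from the DSCP
     * field of the packet before letting the stack pick the Tx queue.
     */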
2405u16
2406ice_select_queue(struct net_device *netdev, struct sk_buff *skb,
2407                 struct net_device *sb_dev)
2408{
2409        struct ice_pf *pf = ice_netdev_to_pf(netdev);
2410        struct ice_dcbx_cfg *dcbcfg;
2411
2412        dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
2413        if (dcbcfg->pfc_mode == ICE_QOS_MODE_DSCP)
2414                skb->priority = ice_get_dscp_up(dcbcfg, skb);
2415
2416        return netdev_pick_tx(netdev, skb, sb_dev);
2417}
2418
2419/**
2420 * ice_clean_ctrl_tx_irq - interrupt handler for flow director Tx queue
2421 * @tx_ring: tx_ring to clean
2422 */
2423void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring)
2424{
2425        struct ice_vsi *vsi = tx_ring->vsi;
2426        s16 i = tx_ring->next_to_clean;
2427        int budget = ICE_DFLT_IRQ_WORK;
2428        struct ice_tx_desc *tx_desc;
2429        struct ice_tx_buf *tx_buf;
2430
2431        tx_buf = &tx_ring->tx_buf[i];
2432        tx_desc = ICE_TX_DESC(tx_ring, i);
2433        i -= tx_ring->count;
2434
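            /* each Flow Director programming request occupies two descriptors,
             * the filter descriptor followed by the dummy packet descriptor,
             * so they are cleaned in pairs below
             */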
2435        do {
2436                struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;
2437
2438                /* if next_to_watch is not set then there is no pending work */
2439                if (!eop_desc)
2440                        break;
2441
2442                /* prevent any other reads prior to eop_desc */
2443                smp_rmb();
2444
2445                /* if the descriptor isn't done, no work to do */
2446                if (!(eop_desc->cmd_type_offset_bsz &
2447                      cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
2448                        break;
2449
2450                /* clear next_to_watch to prevent false hangs */
2451                tx_buf->next_to_watch = NULL;
2452                tx_desc->buf_addr = 0;
2453                tx_desc->cmd_type_offset_bsz = 0;
2454
2455                /* move past filter desc */
2456                tx_buf++;
2457                tx_desc++;
2458                i++;
2459                if (unlikely(!i)) {
2460                        i -= tx_ring->count;
2461                        tx_buf = tx_ring->tx_buf;
2462                        tx_desc = ICE_TX_DESC(tx_ring, 0);
2463                }
2464
2465                /* unmap the data header */
2466                if (dma_unmap_len(tx_buf, len))
2467                        dma_unmap_single(tx_ring->dev,
2468                                         dma_unmap_addr(tx_buf, dma),
2469                                         dma_unmap_len(tx_buf, len),
2470                                         DMA_TO_DEVICE);
2471                if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT)
2472                        devm_kfree(tx_ring->dev, tx_buf->raw_buf);
2473
2474                /* reset the Tx buffer; clear next_to_watch to prevent false hangs */
2475                tx_buf->raw_buf = NULL;
2476                tx_buf->tx_flags = 0;
2477                tx_buf->next_to_watch = NULL;
2478                dma_unmap_len_set(tx_buf, len, 0);
2479                tx_desc->buf_addr = 0;
2480                tx_desc->cmd_type_offset_bsz = 0;
2481
2482                /* move past eop_desc for start of next FD desc */
2483                tx_buf++;
2484                tx_desc++;
2485                i++;
2486                if (unlikely(!i)) {
2487                        i -= tx_ring->count;
2488                        tx_buf = tx_ring->tx_buf;
2489                        tx_desc = ICE_TX_DESC(tx_ring, 0);
2490                }
2491
2492                budget--;
2493        } while (likely(budget));
2494
2495        i += tx_ring->count;
2496        tx_ring->next_to_clean = i;
2497
2498        /* re-enable interrupt if needed */
2499        ice_irq_dynamic_ena(&vsi->back->hw, vsi, vsi->q_vectors[0]);
2500}
2501