linux/drivers/net/ethernet/sfc/tx.c
   1/****************************************************************************
   2 * Driver for Solarflare Solarstorm network controllers and boards
   3 * Copyright 2005-2006 Fen Systems Ltd.
   4 * Copyright 2005-2010 Solarflare Communications Inc.
   5 *
   6 * This program is free software; you can redistribute it and/or modify it
   7 * under the terms of the GNU General Public License version 2 as published
   8 * by the Free Software Foundation, incorporated herein by reference.
   9 */
  10
  11#include <linux/pci.h>
  12#include <linux/tcp.h>
  13#include <linux/ip.h>
  14#include <linux/in.h>
  15#include <linux/ipv6.h>
  16#include <linux/slab.h>
  17#include <net/ipv6.h>
  18#include <linux/if_ether.h>
  19#include <linux/highmem.h>
  20#include "net_driver.h"
  21#include "efx.h"
  22#include "nic.h"
  23#include "workarounds.h"
  24
  25static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
  26                               struct efx_tx_buffer *buffer,
  27                               unsigned int *pkts_compl,
  28                               unsigned int *bytes_compl)
  29{
  30        if (buffer->unmap_len) {
  31                struct device *dma_dev = &tx_queue->efx->pci_dev->dev;
  32                dma_addr_t unmap_addr = (buffer->dma_addr + buffer->len -
  33                                         buffer->unmap_len);
  34                if (buffer->flags & EFX_TX_BUF_MAP_SINGLE)
  35                        dma_unmap_single(dma_dev, unmap_addr, buffer->unmap_len,
  36                                         DMA_TO_DEVICE);
  37                else
  38                        dma_unmap_page(dma_dev, unmap_addr, buffer->unmap_len,
  39                                       DMA_TO_DEVICE);
  40                buffer->unmap_len = 0;
  41        }
  42
  43        if (buffer->flags & EFX_TX_BUF_SKB) {
  44                (*pkts_compl)++;
  45                (*bytes_compl) += buffer->skb->len;
  46                dev_kfree_skb_any((struct sk_buff *) buffer->skb);
  47                netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev,
  48                           "TX queue %d transmission id %x complete\n",
  49                           tx_queue->queue, tx_queue->read_count);
  50        } else if (buffer->flags & EFX_TX_BUF_HEAP) {
  51                kfree(buffer->heap_buf);
  52        }
  53
  54        buffer->len = 0;
  55        buffer->flags = 0;
  56}
  57
  58static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
  59                               struct sk_buff *skb);
  60
  61static inline unsigned
  62efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr)
  63{
  64        /* Depending on the NIC revision, we can use descriptor
  65         * lengths up to 8K or 8K-1.  However, since PCI Express
  66         * devices must split read requests at 4K boundaries, there is
  67         * little benefit from using descriptors that cross those
  68         * boundaries and we keep things simple by not doing so.
  69         */
  70        unsigned len = (~dma_addr & (EFX_PAGE_SIZE - 1)) + 1;
  71
  72        /* Work around hardware bug for unaligned buffers. */
  73        if (EFX_WORKAROUND_5391(efx) && (dma_addr & 0xf))
  74                len = min_t(unsigned, len, 512 - (dma_addr & 0xf));
  75
  76        return len;
  77}
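/* For illustration only, assuming EFX_PAGE_SIZE == 4096: a fragment
 * whose dma_addr ends in 0xf00 gives len = (~0xf00 & 0xfff) + 1 =
 * 0x100, i.e. 256 bytes up to the next 4K boundary.  A 2048-byte
 * fragment starting there would therefore be emitted by the callers
 * as a 256-byte descriptor followed by a 1792-byte descriptor.
 */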
  78
  79unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
  80{
  81        /* Header and payload descriptor for each output segment, plus
  82         * one for every input fragment boundary within a segment
  83         */
  84        unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
  85
  86        /* Possibly one more per segment for the alignment workaround */
  87        if (EFX_WORKAROUND_5391(efx))
  88                max_descs += EFX_TSO_MAX_SEGS;
  89
  90        /* Possibly more for PCIe page boundaries within input fragments */
  91        if (PAGE_SIZE > EFX_PAGE_SIZE)
  92                max_descs += max_t(unsigned int, MAX_SKB_FRAGS,
  93                                   DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE));
  94
  95        return max_descs;
  96}
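/* Worked example, assuming EFX_TSO_MAX_SEGS == 100 and
 * MAX_SKB_FRAGS == 17 (typical with 4K pages): the baseline is
 * 100 * 2 + 17 = 217 descriptors, and workaround 5391 can add another
 * 100 for a worst case of 317 descriptors per skb.
 */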
  97
  98/* Get partner of a TX queue, seen as part of the same net core queue */
  99static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue)
 100{
 101        if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD)
 102                return tx_queue - EFX_TXQ_TYPE_OFFLOAD;
 103        else
 104                return tx_queue + EFX_TXQ_TYPE_OFFLOAD;
 105}
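/* Illustration, assuming EFX_TXQ_TYPE_OFFLOAD == 1: queues 2n and
 * 2n + 1 within a channel differ only in the offload bit, so queue 4
 * partners queue 5 and vice versa; adding or subtracting the bit
 * simply toggles between the pair.
 */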
 106
 107static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1)
 108{
 109        /* We need to consider both queues that the net core sees as one */
 110        struct efx_tx_queue *txq2 = efx_tx_queue_partner(txq1);
 111        struct efx_nic *efx = txq1->efx;
 112        unsigned int fill_level;
 113
 114        fill_level = max(txq1->insert_count - txq1->old_read_count,
 115                         txq2->insert_count - txq2->old_read_count);
 116        if (likely(fill_level < efx->txq_stop_thresh))
 117                return;
 118
 119        /* We used the stale old_read_count above, which gives us a
 120         * pessimistic estimate of the fill level (which may even
 121         * validly be >= efx->txq_entries).  Now try again using
 122         * read_count (more likely to be a cache miss).
 123         *
 124         * If we read read_count and then conditionally stop the
 125         * queue, it is possible for the completion path to race with
 126         * us and complete all outstanding descriptors in the middle,
 127         * after which there will be no more completions to wake it.
 128         * Therefore we stop the queue first, then read read_count
 129         * (with a memory barrier to ensure the ordering), then
 130         * restart the queue if the fill level turns out to be low
 131         * enough.
 132         */
 133        netif_tx_stop_queue(txq1->core_txq);
 134        smp_mb();
 135        txq1->old_read_count = ACCESS_ONCE(txq1->read_count);
 136        txq2->old_read_count = ACCESS_ONCE(txq2->read_count);
 137
 138        fill_level = max(txq1->insert_count - txq1->old_read_count,
 139                         txq2->insert_count - txq2->old_read_count);
 140        EFX_BUG_ON_PARANOID(fill_level >= efx->txq_entries);
 141        if (likely(fill_level < efx->txq_stop_thresh)) {
 142                smp_mb();
 143                if (likely(!efx->loopback_selftest))
 144                        netif_tx_start_queue(txq1->core_txq);
 145        }
 146}
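/* Note that insert_count and read_count are free-running counters, so
 * the unsigned subtractions above remain correct across wrap-around;
 * with 32-bit counters, insert_count == 0x00000010 and
 * old_read_count == 0xfffffff0 still yields a fill level of 0x20.
 */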
 147
 148/*
 149 * Add a socket buffer to a TX queue
 150 *
 151 * This maps all fragments of a socket buffer for DMA and adds them to
 152 * the TX queue.  The queue's insert pointer will be incremented by
 153 * the number of fragments in the socket buffer.
 154 *
 155 * If any DMA mapping fails, any mapped fragments will be unmapped,
  156 * and the queue's insert pointer will be restored to its original value.
 157 *
 158 * This function is split out from efx_hard_start_xmit to allow the
 159 * loopback test to direct packets via specific TX queues.
 160 *
 161 * Returns NETDEV_TX_OK.
 162 * You must hold netif_tx_lock() to call this function.
 163 */
 164netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 165{
 166        struct efx_nic *efx = tx_queue->efx;
 167        struct device *dma_dev = &efx->pci_dev->dev;
 168        struct efx_tx_buffer *buffer;
 169        skb_frag_t *fragment;
 170        unsigned int len, unmap_len = 0, insert_ptr;
 171        dma_addr_t dma_addr, unmap_addr = 0;
 172        unsigned int dma_len;
 173        unsigned short dma_flags;
 174        int i = 0;
 175
 176        EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
 177
 178        if (skb_shinfo(skb)->gso_size)
 179                return efx_enqueue_skb_tso(tx_queue, skb);
 180
 181        /* Get size of the initial fragment */
 182        len = skb_headlen(skb);
 183
 184        /* Pad if necessary */
 185        if (EFX_WORKAROUND_15592(efx) && skb->len <= 32) {
 186                EFX_BUG_ON_PARANOID(skb->data_len);
 187                len = 32 + 1;
 188                if (skb_pad(skb, len - skb->len))
 189                        return NETDEV_TX_OK;
 190        }
 191
 192        /* Map for DMA.  Use dma_map_single rather than dma_map_page
 193         * since this is more efficient on machines with sparse
 194         * memory.
 195         */
 196        dma_flags = EFX_TX_BUF_MAP_SINGLE;
  197        dma_addr = dma_map_single(dma_dev, skb->data, len, DMA_TO_DEVICE);
 198
 199        /* Process all fragments */
 200        while (1) {
 201                if (unlikely(dma_mapping_error(dma_dev, dma_addr)))
 202                        goto dma_err;
 203
 204                /* Store fields for marking in the per-fragment final
 205                 * descriptor */
 206                unmap_len = len;
 207                unmap_addr = dma_addr;
 208
 209                /* Add to TX queue, splitting across DMA boundaries */
 210                do {
 211                        insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
 212                        buffer = &tx_queue->buffer[insert_ptr];
 213                        EFX_BUG_ON_PARANOID(buffer->flags);
 214                        EFX_BUG_ON_PARANOID(buffer->len);
 215                        EFX_BUG_ON_PARANOID(buffer->unmap_len);
 216
 217                        dma_len = efx_max_tx_len(efx, dma_addr);
 218                        if (likely(dma_len >= len))
 219                                dma_len = len;
 220
 221                        /* Fill out per descriptor fields */
 222                        buffer->len = dma_len;
 223                        buffer->dma_addr = dma_addr;
 224                        buffer->flags = EFX_TX_BUF_CONT;
 225                        len -= dma_len;
 226                        dma_addr += dma_len;
 227                        ++tx_queue->insert_count;
 228                } while (len);
 229
 230                /* Transfer ownership of the unmapping to the final buffer */
 231                buffer->flags = EFX_TX_BUF_CONT | dma_flags;
 232                buffer->unmap_len = unmap_len;
 233                unmap_len = 0;
 234
 235                /* Get address and size of next fragment */
 236                if (i >= skb_shinfo(skb)->nr_frags)
 237                        break;
 238                fragment = &skb_shinfo(skb)->frags[i];
 239                len = skb_frag_size(fragment);
 240                i++;
 241                /* Map for DMA */
 242                dma_flags = 0;
 243                dma_addr = skb_frag_dma_map(dma_dev, fragment, 0, len,
 244                                            DMA_TO_DEVICE);
 245        }
 246
 247        /* Transfer ownership of the skb to the final buffer */
 248        buffer->skb = skb;
 249        buffer->flags = EFX_TX_BUF_SKB | dma_flags;
 250
 251        netdev_tx_sent_queue(tx_queue->core_txq, skb->len);
 252
 253        /* Pass off to hardware */
 254        efx_nic_push_buffers(tx_queue);
 255
 256        efx_tx_maybe_stop_queue(tx_queue);
 257
 258        return NETDEV_TX_OK;
 259
 260 dma_err:
 261        netif_err(efx, tx_err, efx->net_dev,
  262                  "TX queue %d could not map skb with %d bytes %d "
 263                  "fragments for DMA\n", tx_queue->queue, skb->len,
 264                  skb_shinfo(skb)->nr_frags + 1);
 265
 266        /* Mark the packet as transmitted, and free the SKB ourselves */
 267        dev_kfree_skb_any(skb);
 268
 269        /* Work backwards until we hit the original insert pointer value */
 270        while (tx_queue->insert_count != tx_queue->write_count) {
 271                unsigned int pkts_compl = 0, bytes_compl = 0;
 272                --tx_queue->insert_count;
 273                insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
 274                buffer = &tx_queue->buffer[insert_ptr];
 275                efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
 276        }
 277
 278        /* Free the fragment we were mid-way through pushing */
 279        if (unmap_len) {
 280                if (dma_flags & EFX_TX_BUF_MAP_SINGLE)
 281                        dma_unmap_single(dma_dev, unmap_addr, unmap_len,
 282                                         DMA_TO_DEVICE);
 283                else
 284                        dma_unmap_page(dma_dev, unmap_addr, unmap_len,
 285                                       DMA_TO_DEVICE);
 286        }
 287
 288        return NETDEV_TX_OK;
 289}
 290
 291/* Remove packets from the TX queue
 292 *
 293 * This removes packets from the TX queue, up to and including the
 294 * specified index.
 295 */
 296static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue,
 297                                unsigned int index,
 298                                unsigned int *pkts_compl,
 299                                unsigned int *bytes_compl)
 300{
 301        struct efx_nic *efx = tx_queue->efx;
 302        unsigned int stop_index, read_ptr;
 303
 304        stop_index = (index + 1) & tx_queue->ptr_mask;
 305        read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
 306
 307        while (read_ptr != stop_index) {
 308                struct efx_tx_buffer *buffer = &tx_queue->buffer[read_ptr];
 309                if (unlikely(buffer->len == 0)) {
 310                        netif_err(efx, tx_err, efx->net_dev,
 311                                  "TX queue %d spurious TX completion id %x\n",
 312                                  tx_queue->queue, read_ptr);
 313                        efx_schedule_reset(efx, RESET_TYPE_TX_SKIP);
 314                        return;
 315                }
 316
 317                efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl);
 318
 319                ++tx_queue->read_count;
 320                read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
 321        }
 322}
 323
 324/* Initiate a packet transmission.  We use one channel per CPU
 325 * (sharing when we have more CPUs than channels).  On Falcon, the TX
 326 * completion events will be directed back to the CPU that transmitted
 327 * the packet, which should be cache-efficient.
 328 *
 329 * Context: non-blocking.
 330 * Note that returning anything other than NETDEV_TX_OK will cause the
 331 * OS to free the skb.
 332 */
 333netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 334                                struct net_device *net_dev)
 335{
 336        struct efx_nic *efx = netdev_priv(net_dev);
 337        struct efx_tx_queue *tx_queue;
 338        unsigned index, type;
 339
 340        EFX_WARN_ON_PARANOID(!netif_device_present(net_dev));
 341
 342        /* PTP "event" packet */
 343        if (unlikely(efx_xmit_with_hwtstamp(skb)) &&
 344            unlikely(efx_ptp_is_ptp_tx(efx, skb))) {
 345                return efx_ptp_tx(efx, skb);
 346        }
 347
 348        index = skb_get_queue_mapping(skb);
 349        type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0;
 350        if (index >= efx->n_tx_channels) {
 351                index -= efx->n_tx_channels;
 352                type |= EFX_TXQ_TYPE_HIGHPRI;
 353        }
 354        tx_queue = efx_get_tx_queue(efx, index, type);
 355
 356        return efx_enqueue_skb(tx_queue, skb);
 357}
 358
 359void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue)
 360{
 361        struct efx_nic *efx = tx_queue->efx;
 362
 363        /* Must be inverse of queue lookup in efx_hard_start_xmit() */
 364        tx_queue->core_txq =
 365                netdev_get_tx_queue(efx->net_dev,
 366                                    tx_queue->queue / EFX_TXQ_TYPES +
 367                                    ((tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI) ?
 368                                     efx->n_tx_channels : 0));
 369}
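/* Worked example, assuming EFX_TXQ_TYPES == 4 and n_tx_channels == 4:
 * hardware queue 6 (channel 1, EFX_TXQ_TYPE_HIGHPRI) maps to core txq
 * 6 / 4 + 4 = 5, and efx_hard_start_xmit() inverts this by reducing
 * index 5 to channel 1 and setting EFX_TXQ_TYPE_HIGHPRI.
 */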
 370
 371int efx_setup_tc(struct net_device *net_dev, u8 num_tc)
 372{
 373        struct efx_nic *efx = netdev_priv(net_dev);
 374        struct efx_channel *channel;
 375        struct efx_tx_queue *tx_queue;
 376        unsigned tc;
 377        int rc;
 378
 379        if (efx_nic_rev(efx) < EFX_REV_FALCON_B0 || num_tc > EFX_MAX_TX_TC)
 380                return -EINVAL;
 381
 382        if (num_tc == net_dev->num_tc)
 383                return 0;
 384
 385        for (tc = 0; tc < num_tc; tc++) {
 386                net_dev->tc_to_txq[tc].offset = tc * efx->n_tx_channels;
 387                net_dev->tc_to_txq[tc].count = efx->n_tx_channels;
 388        }
 389
 390        if (num_tc > net_dev->num_tc) {
 391                /* Initialise high-priority queues as necessary */
 392                efx_for_each_channel(channel, efx) {
 393                        efx_for_each_possible_channel_tx_queue(tx_queue,
 394                                                               channel) {
 395                                if (!(tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI))
 396                                        continue;
 397                                if (!tx_queue->buffer) {
 398                                        rc = efx_probe_tx_queue(tx_queue);
 399                                        if (rc)
 400                                                return rc;
 401                                }
 402                                if (!tx_queue->initialised)
 403                                        efx_init_tx_queue(tx_queue);
 404                                efx_init_tx_queue_core_txq(tx_queue);
 405                        }
 406                }
 407        } else {
 408                /* Reduce number of classes before number of queues */
 409                net_dev->num_tc = num_tc;
 410        }
 411
 412        rc = netif_set_real_num_tx_queues(net_dev,
 413                                          max_t(int, num_tc, 1) *
 414                                          efx->n_tx_channels);
 415        if (rc)
 416                return rc;
 417
 418        /* Do not destroy high-priority queues when they become
 419         * unused.  We would have to flush them first, and it is
 420         * fairly difficult to flush a subset of TX queues.  Leave
 421         * it to efx_fini_channels().
 422         */
 423
 424        net_dev->num_tc = num_tc;
 425        return 0;
 426}
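/* Worked example, assuming n_tx_channels == 4: efx_setup_tc(dev, 2)
 * sets tc_to_txq[0] = { .offset = 0, .count = 4 } and
 * tc_to_txq[1] = { .offset = 4, .count = 4 }, then asks the stack for
 * 2 * 4 = 8 real TX queues, with queues 4-7 carrying high-priority
 * traffic.
 */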
 427
 428void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
 429{
 430        unsigned fill_level;
 431        struct efx_nic *efx = tx_queue->efx;
 432        struct efx_tx_queue *txq2;
 433        unsigned int pkts_compl = 0, bytes_compl = 0;
 434
 435        EFX_BUG_ON_PARANOID(index > tx_queue->ptr_mask);
 436
 437        efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl);
 438        netdev_tx_completed_queue(tx_queue->core_txq, pkts_compl, bytes_compl);
 439
 440        /* See if we need to restart the netif queue.  This memory
 441         * barrier ensures that we write read_count (inside
 442         * efx_dequeue_buffers()) before reading the queue status.
 443         */
 444        smp_mb();
 445        if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) &&
 446            likely(efx->port_enabled) &&
 447            likely(netif_device_present(efx->net_dev))) {
 448                txq2 = efx_tx_queue_partner(tx_queue);
 449                fill_level = max(tx_queue->insert_count - tx_queue->read_count,
 450                                 txq2->insert_count - txq2->read_count);
 451                if (fill_level <= efx->txq_wake_thresh)
 452                        netif_tx_wake_queue(tx_queue->core_txq);
 453        }
 454
 455        /* Check whether the hardware queue is now empty */
 456        if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) {
 457                tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count);
 458                if (tx_queue->read_count == tx_queue->old_write_count) {
 459                        smp_mb();
 460                        tx_queue->empty_read_count =
 461                                tx_queue->read_count | EFX_EMPTY_COUNT_VALID;
 462                }
 463        }
 464}
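/* The (int) cast above compares the free-running counters as a signed
 * difference, so "read_count has caught up with old_write_count" is
 * detected correctly even across wrap-around; only then is
 * write_count re-sampled, and empty_read_count is recorded so the NIC
 * layer can tell that the queue went empty.
 */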
 465
 466/* Size of page-based TSO header buffers.  Larger blocks must be
 467 * allocated from the heap.
 468 */
 469#define TSOH_STD_SIZE   128
 470#define TSOH_PER_PAGE   (PAGE_SIZE / TSOH_STD_SIZE)
 471
 472/* At most half the descriptors in the queue at any time will refer to
 473 * a TSO header buffer, since they must always be followed by a
 474 * payload descriptor referring to an skb.
 475 */
 476static unsigned int efx_tsoh_page_count(struct efx_tx_queue *tx_queue)
 477{
 478        return DIV_ROUND_UP(tx_queue->ptr_mask + 1, 2 * TSOH_PER_PAGE);
 479}
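/* For illustration, assuming PAGE_SIZE == 4096: TSOH_PER_PAGE is 32,
 * so a 1024-entry TX ring needs DIV_ROUND_UP(1024, 64) = 16 pages of
 * standard-size header buffers.
 */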
 480
 481int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
 482{
 483        struct efx_nic *efx = tx_queue->efx;
 484        unsigned int entries;
 485        int rc;
 486
 487        /* Create the smallest power-of-two aligned ring */
 488        entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE);
 489        EFX_BUG_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE);
 490        tx_queue->ptr_mask = entries - 1;
 491
 492        netif_dbg(efx, probe, efx->net_dev,
 493                  "creating TX queue %d size %#x mask %#x\n",
 494                  tx_queue->queue, efx->txq_entries, tx_queue->ptr_mask);
 495
 496        /* Allocate software ring */
 497        tx_queue->buffer = kcalloc(entries, sizeof(*tx_queue->buffer),
 498                                   GFP_KERNEL);
 499        if (!tx_queue->buffer)
 500                return -ENOMEM;
 501
 502        if (tx_queue->queue & EFX_TXQ_TYPE_OFFLOAD) {
 503                tx_queue->tsoh_page =
 504                        kcalloc(efx_tsoh_page_count(tx_queue),
 505                                sizeof(tx_queue->tsoh_page[0]), GFP_KERNEL);
 506                if (!tx_queue->tsoh_page) {
 507                        rc = -ENOMEM;
 508                        goto fail1;
 509                }
 510        }
 511
 512        /* Allocate hardware ring */
 513        rc = efx_nic_probe_tx(tx_queue);
 514        if (rc)
 515                goto fail2;
 516
 517        return 0;
 518
 519fail2:
 520        kfree(tx_queue->tsoh_page);
 521        tx_queue->tsoh_page = NULL;
 522fail1:
 523        kfree(tx_queue->buffer);
 524        tx_queue->buffer = NULL;
 525        return rc;
 526}
 527
 528void efx_init_tx_queue(struct efx_tx_queue *tx_queue)
 529{
 530        netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
 531                  "initialising TX queue %d\n", tx_queue->queue);
 532
 533        tx_queue->insert_count = 0;
 534        tx_queue->write_count = 0;
 535        tx_queue->old_write_count = 0;
 536        tx_queue->read_count = 0;
 537        tx_queue->old_read_count = 0;
 538        tx_queue->empty_read_count = 0 | EFX_EMPTY_COUNT_VALID;
 539
 540        /* Set up TX descriptor ring */
 541        efx_nic_init_tx(tx_queue);
 542
 543        tx_queue->initialised = true;
 544}
 545
 546void efx_release_tx_buffers(struct efx_tx_queue *tx_queue)
 547{
 548        struct efx_tx_buffer *buffer;
 549
 550        if (!tx_queue->buffer)
 551                return;
 552
 553        /* Free any buffers left in the ring */
 554        while (tx_queue->read_count != tx_queue->write_count) {
 555                unsigned int pkts_compl = 0, bytes_compl = 0;
 556                buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask];
 557                efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
 558
 559                ++tx_queue->read_count;
 560        }
 561        netdev_tx_reset_queue(tx_queue->core_txq);
 562}
 563
 564void efx_fini_tx_queue(struct efx_tx_queue *tx_queue)
 565{
 566        if (!tx_queue->initialised)
 567                return;
 568
 569        netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
 570                  "shutting down TX queue %d\n", tx_queue->queue);
 571
 572        tx_queue->initialised = false;
 573
 574        /* Flush TX queue, remove descriptor ring */
 575        efx_nic_fini_tx(tx_queue);
 576
 577        efx_release_tx_buffers(tx_queue);
 578}
 579
 580void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
 581{
 582        int i;
 583
 584        if (!tx_queue->buffer)
 585                return;
 586
 587        netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
 588                  "destroying TX queue %d\n", tx_queue->queue);
 589        efx_nic_remove_tx(tx_queue);
 590
 591        if (tx_queue->tsoh_page) {
 592                for (i = 0; i < efx_tsoh_page_count(tx_queue); i++)
 593                        efx_nic_free_buffer(tx_queue->efx,
 594                                            &tx_queue->tsoh_page[i]);
 595                kfree(tx_queue->tsoh_page);
 596                tx_queue->tsoh_page = NULL;
 597        }
 598
 599        kfree(tx_queue->buffer);
 600        tx_queue->buffer = NULL;
 601}
 602
 603
 604/* Efx TCP segmentation acceleration.
 605 *
 606 * Why?  Because by doing it here in the driver we can go significantly
  607 * faster than GSO.
 608 *
 609 * Requires TX checksum offload support.
 610 */
 611
 612/* Number of bytes inserted at the start of a TSO header buffer,
 613 * similar to NET_IP_ALIGN.
 614 */
 615#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 616#define TSOH_OFFSET     0
 617#else
 618#define TSOH_OFFSET     NET_IP_ALIGN
 619#endif
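/* For example, where NET_IP_ALIGN is 2, copying the 14-byte Ethernet
 * header 2 bytes into the buffer leaves the IP header starting at a
 * 4-byte-aligned offset of 16.
 */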
 620
 621#define PTR_DIFF(p1, p2)  ((u8 *)(p1) - (u8 *)(p2))
 622
 623/**
 624 * struct tso_state - TSO state for an SKB
 625 * @out_len: Remaining length in current segment
 626 * @seqnum: Current sequence number
 627 * @ipv4_id: Current IPv4 ID, host endian
 628 * @packet_space: Remaining space in current packet
 629 * @dma_addr: DMA address of current position
 630 * @in_len: Remaining length in current SKB fragment
 631 * @unmap_len: Length of SKB fragment
 632 * @unmap_addr: DMA address of SKB fragment
 633 * @dma_flags: TX buffer flags for DMA mapping - %EFX_TX_BUF_MAP_SINGLE or 0
 634 * @protocol: Network protocol (after any VLAN header)
 635 * @ip_off: Offset of IP header
 636 * @tcp_off: Offset of TCP header
 637 * @header_len: Number of bytes of header
 638 * @ip_base_len: IPv4 tot_len or IPv6 payload_len, before TCP payload
 639 *
 640 * The state used during segmentation.  It is put into this data structure
 641 * just to make it easy to pass into inline functions.
 642 */
 643struct tso_state {
 644        /* Output position */
 645        unsigned out_len;
 646        unsigned seqnum;
 647        unsigned ipv4_id;
 648        unsigned packet_space;
 649
 650        /* Input position */
 651        dma_addr_t dma_addr;
 652        unsigned in_len;
 653        unsigned unmap_len;
 654        dma_addr_t unmap_addr;
 655        unsigned short dma_flags;
 656
 657        __be16 protocol;
 658        unsigned int ip_off;
 659        unsigned int tcp_off;
 660        unsigned header_len;
 661        unsigned int ip_base_len;
 662};
 663
 664
 665/*
 666 * Verify that our various assumptions about sk_buffs and the conditions
 667 * under which TSO will be attempted hold true.  Return the protocol number.
 668 */
 669static __be16 efx_tso_check_protocol(struct sk_buff *skb)
 670{
 671        __be16 protocol = skb->protocol;
 672
 673        EFX_BUG_ON_PARANOID(((struct ethhdr *)skb->data)->h_proto !=
 674                            protocol);
 675        if (protocol == htons(ETH_P_8021Q)) {
 676                struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
 677                protocol = veh->h_vlan_encapsulated_proto;
 678        }
 679
 680        if (protocol == htons(ETH_P_IP)) {
 681                EFX_BUG_ON_PARANOID(ip_hdr(skb)->protocol != IPPROTO_TCP);
 682        } else {
 683                EFX_BUG_ON_PARANOID(protocol != htons(ETH_P_IPV6));
 684                EFX_BUG_ON_PARANOID(ipv6_hdr(skb)->nexthdr != NEXTHDR_TCP);
 685        }
 686        EFX_BUG_ON_PARANOID((PTR_DIFF(tcp_hdr(skb), skb->data)
 687                             + (tcp_hdr(skb)->doff << 2u)) >
 688                            skb_headlen(skb));
 689
 690        return protocol;
 691}
 692
 693static u8 *efx_tsoh_get_buffer(struct efx_tx_queue *tx_queue,
 694                               struct efx_tx_buffer *buffer, unsigned int len)
 695{
 696        u8 *result;
 697
 698        EFX_BUG_ON_PARANOID(buffer->len);
 699        EFX_BUG_ON_PARANOID(buffer->flags);
 700        EFX_BUG_ON_PARANOID(buffer->unmap_len);
 701
 702        if (likely(len <= TSOH_STD_SIZE - TSOH_OFFSET)) {
 703                unsigned index =
 704                        (tx_queue->insert_count & tx_queue->ptr_mask) / 2;
 705                struct efx_buffer *page_buf =
 706                        &tx_queue->tsoh_page[index / TSOH_PER_PAGE];
 707                unsigned offset =
 708                        TSOH_STD_SIZE * (index % TSOH_PER_PAGE) + TSOH_OFFSET;
 709
 710                if (unlikely(!page_buf->addr) &&
 711                    efx_nic_alloc_buffer(tx_queue->efx, page_buf, PAGE_SIZE))
 712                        return NULL;
 713
 714                result = (u8 *)page_buf->addr + offset;
 715                buffer->dma_addr = page_buf->dma_addr + offset;
 716                buffer->flags = EFX_TX_BUF_CONT;
 717        } else {
 718                tx_queue->tso_long_headers++;
 719
 720                buffer->heap_buf = kmalloc(TSOH_OFFSET + len, GFP_ATOMIC);
 721                if (unlikely(!buffer->heap_buf))
 722                        return NULL;
 723                result = (u8 *)buffer->heap_buf + TSOH_OFFSET;
 724                buffer->flags = EFX_TX_BUF_CONT | EFX_TX_BUF_HEAP;
 725        }
 726
 727        buffer->len = len;
 728
 729        return result;
 730}
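/* Example of the indexing above, assuming PAGE_SIZE == 4096
 * (TSOH_PER_PAGE == 32) and ptr_mask == 1023: insert_count == 70 gives
 * header index 35, which lives in tsoh_page[1] at byte offset
 * 128 * (35 % 32) + TSOH_OFFSET = 384 + TSOH_OFFSET.
 */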
 731
 732/**
 733 * efx_tx_queue_insert - push descriptors onto the TX queue
 734 * @tx_queue:           Efx TX queue
 735 * @dma_addr:           DMA address of fragment
 736 * @len:                Length of fragment
 737 * @final_buffer:       The final buffer inserted into the queue
 738 *
 739 * Push descriptors onto the TX queue.
 740 */
 741static void efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
 742                                dma_addr_t dma_addr, unsigned len,
 743                                struct efx_tx_buffer **final_buffer)
 744{
 745        struct efx_tx_buffer *buffer;
 746        struct efx_nic *efx = tx_queue->efx;
 747        unsigned dma_len, insert_ptr;
 748
 749        EFX_BUG_ON_PARANOID(len <= 0);
 750
 751        while (1) {
 752                insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
 753                buffer = &tx_queue->buffer[insert_ptr];
 754                ++tx_queue->insert_count;
 755
 756                EFX_BUG_ON_PARANOID(tx_queue->insert_count -
 757                                    tx_queue->read_count >=
 758                                    efx->txq_entries);
 759
 760                EFX_BUG_ON_PARANOID(buffer->len);
 761                EFX_BUG_ON_PARANOID(buffer->unmap_len);
 762                EFX_BUG_ON_PARANOID(buffer->flags);
 763
 764                buffer->dma_addr = dma_addr;
 765
 766                dma_len = efx_max_tx_len(efx, dma_addr);
 767
 768                /* If there is enough space to send then do so */
 769                if (dma_len >= len)
 770                        break;
 771
 772                buffer->len = dma_len;
 773                buffer->flags = EFX_TX_BUF_CONT;
 774                dma_addr += dma_len;
 775                len -= dma_len;
 776        }
 777
 778        EFX_BUG_ON_PARANOID(!len);
 779        buffer->len = len;
 780        *final_buffer = buffer;
 781}
 782
 783
 784/*
 785 * Put a TSO header into the TX queue.
 786 *
 787 * This is special-cased because we know that it is small enough to fit in
 788 * a single fragment, and we know it doesn't cross a page boundary.  It
 789 * also allows us to not worry about end-of-packet etc.
 790 */
 791static int efx_tso_put_header(struct efx_tx_queue *tx_queue,
 792                              struct efx_tx_buffer *buffer, u8 *header)
 793{
 794        if (unlikely(buffer->flags & EFX_TX_BUF_HEAP)) {
 795                buffer->dma_addr = dma_map_single(&tx_queue->efx->pci_dev->dev,
 796                                                  header, buffer->len,
 797                                                  DMA_TO_DEVICE);
 798                if (unlikely(dma_mapping_error(&tx_queue->efx->pci_dev->dev,
 799                                               buffer->dma_addr))) {
 800                        kfree(buffer->heap_buf);
 801                        buffer->len = 0;
 802                        buffer->flags = 0;
 803                        return -ENOMEM;
 804                }
 805                buffer->unmap_len = buffer->len;
 806                buffer->flags |= EFX_TX_BUF_MAP_SINGLE;
 807        }
 808
 809        ++tx_queue->insert_count;
 810        return 0;
 811}
 812
 813
  814/* Remove buffers put into a tx_queue.  None of the buffers may have
 815 * an skb attached.
 816 */
 817static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
 818{
 819        struct efx_tx_buffer *buffer;
 820
 821        /* Work backwards until we hit the original insert pointer value */
 822        while (tx_queue->insert_count != tx_queue->write_count) {
 823                --tx_queue->insert_count;
 824                buffer = &tx_queue->buffer[tx_queue->insert_count &
 825                                           tx_queue->ptr_mask];
 826                efx_dequeue_buffer(tx_queue, buffer, NULL, NULL);
 827        }
 828}
 829
 830
 831/* Parse the SKB header and initialise state. */
 832static void tso_start(struct tso_state *st, const struct sk_buff *skb)
 833{
 834        st->ip_off = skb_network_header(skb) - skb->data;
 835        st->tcp_off = skb_transport_header(skb) - skb->data;
 836        st->header_len = st->tcp_off + (tcp_hdr(skb)->doff << 2u);
 837        if (st->protocol == htons(ETH_P_IP)) {
 838                st->ip_base_len = st->header_len - st->ip_off;
 839                st->ipv4_id = ntohs(ip_hdr(skb)->id);
 840        } else {
 841                st->ip_base_len = st->header_len - st->tcp_off;
 842                st->ipv4_id = 0;
 843        }
 844        st->seqnum = ntohl(tcp_hdr(skb)->seq);
 845
 846        EFX_BUG_ON_PARANOID(tcp_hdr(skb)->urg);
 847        EFX_BUG_ON_PARANOID(tcp_hdr(skb)->syn);
 848        EFX_BUG_ON_PARANOID(tcp_hdr(skb)->rst);
 849
 850        st->out_len = skb->len - st->header_len;
 851        st->unmap_len = 0;
 852        st->dma_flags = 0;
 853}
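/* Illustrative numbers for an option-less IPv4 TCP skb: ip_off == 14,
 * tcp_off == 34 and header_len == 54, so ip_base_len == 40 (IP plus
 * TCP header, the part of tot_len preceding the payload).  For IPv6
 * (40-byte header) ip_base_len == 20, since payload_len excludes the
 * IPv6 header itself.
 */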
 854
 855static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx,
 856                            skb_frag_t *frag)
 857{
 858        st->unmap_addr = skb_frag_dma_map(&efx->pci_dev->dev, frag, 0,
 859                                          skb_frag_size(frag), DMA_TO_DEVICE);
 860        if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) {
 861                st->dma_flags = 0;
 862                st->unmap_len = skb_frag_size(frag);
 863                st->in_len = skb_frag_size(frag);
 864                st->dma_addr = st->unmap_addr;
 865                return 0;
 866        }
 867        return -ENOMEM;
 868}
 869
 870static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx,
 871                                 const struct sk_buff *skb)
 872{
 873        int hl = st->header_len;
 874        int len = skb_headlen(skb) - hl;
 875
 876        st->unmap_addr = dma_map_single(&efx->pci_dev->dev, skb->data + hl,
 877                                        len, DMA_TO_DEVICE);
 878        if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) {
 879                st->dma_flags = EFX_TX_BUF_MAP_SINGLE;
 880                st->unmap_len = len;
 881                st->in_len = len;
 882                st->dma_addr = st->unmap_addr;
 883                return 0;
 884        }
 885        return -ENOMEM;
 886}
 887
 888
 889/**
 890 * tso_fill_packet_with_fragment - form descriptors for the current fragment
 891 * @tx_queue:           Efx TX queue
 892 * @skb:                Socket buffer
 893 * @st:                 TSO state
 894 *
 895 * Form descriptors for the current fragment, until we reach the end
  896 * of the fragment or end-of-packet.
 897 */
 898static void tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
 899                                          const struct sk_buff *skb,
 900                                          struct tso_state *st)
 901{
 902        struct efx_tx_buffer *buffer;
 903        int n;
 904
 905        if (st->in_len == 0)
 906                return;
 907        if (st->packet_space == 0)
 908                return;
 909
 910        EFX_BUG_ON_PARANOID(st->in_len <= 0);
 911        EFX_BUG_ON_PARANOID(st->packet_space <= 0);
 912
 913        n = min(st->in_len, st->packet_space);
 914
 915        st->packet_space -= n;
 916        st->out_len -= n;
 917        st->in_len -= n;
 918
 919        efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer);
 920
 921        if (st->out_len == 0) {
 922                /* Transfer ownership of the skb */
 923                buffer->skb = skb;
 924                buffer->flags = EFX_TX_BUF_SKB;
 925        } else if (st->packet_space != 0) {
 926                buffer->flags = EFX_TX_BUF_CONT;
 927        }
 928
 929        if (st->in_len == 0) {
 930                /* Transfer ownership of the DMA mapping */
 931                buffer->unmap_len = st->unmap_len;
 932                buffer->flags |= st->dma_flags;
 933                st->unmap_len = 0;
 934        }
 935
 936        st->dma_addr += n;
 937}
 938
 939
 940/**
 941 * tso_start_new_packet - generate a new header and prepare for the new packet
 942 * @tx_queue:           Efx TX queue
 943 * @skb:                Socket buffer
 944 * @st:                 TSO state
 945 *
 946 * Generate a new header and prepare for the new packet.  Return 0 on
  947 * success, or -%ENOMEM if we failed to allocate a header buffer.
 948 */
 949static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
 950                                const struct sk_buff *skb,
 951                                struct tso_state *st)
 952{
 953        struct efx_tx_buffer *buffer =
 954                &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask];
 955        struct tcphdr *tsoh_th;
 956        unsigned ip_length;
 957        u8 *header;
 958        int rc;
 959
 960        /* Allocate and insert a DMA-mapped header buffer. */
 961        header = efx_tsoh_get_buffer(tx_queue, buffer, st->header_len);
 962        if (!header)
 963                return -ENOMEM;
 964
 965        tsoh_th = (struct tcphdr *)(header + st->tcp_off);
 966
 967        /* Copy and update the headers. */
 968        memcpy(header, skb->data, st->header_len);
 969
 970        tsoh_th->seq = htonl(st->seqnum);
 971        st->seqnum += skb_shinfo(skb)->gso_size;
 972        if (st->out_len > skb_shinfo(skb)->gso_size) {
 973                /* This packet will not finish the TSO burst. */
 974                st->packet_space = skb_shinfo(skb)->gso_size;
 975                tsoh_th->fin = 0;
 976                tsoh_th->psh = 0;
 977        } else {
 978                /* This packet will be the last in the TSO burst. */
 979                st->packet_space = st->out_len;
 980                tsoh_th->fin = tcp_hdr(skb)->fin;
 981                tsoh_th->psh = tcp_hdr(skb)->psh;
 982        }
 983        ip_length = st->ip_base_len + st->packet_space;
 984
 985        if (st->protocol == htons(ETH_P_IP)) {
 986                struct iphdr *tsoh_iph = (struct iphdr *)(header + st->ip_off);
 987
 988                tsoh_iph->tot_len = htons(ip_length);
 989
 990                /* Linux leaves suitable gaps in the IP ID space for us to fill. */
 991                tsoh_iph->id = htons(st->ipv4_id);
 992                st->ipv4_id++;
 993        } else {
 994                struct ipv6hdr *tsoh_iph =
 995                        (struct ipv6hdr *)(header + st->ip_off);
 996
 997                tsoh_iph->payload_len = htons(ip_length);
 998        }
 999
1000        rc = efx_tso_put_header(tx_queue, buffer, header);
1001        if (unlikely(rc))
1002                return rc;
1003
1004        ++tx_queue->tso_packets;
1005
1006        return 0;
1007}
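/* Illustrative walk-through, assuming gso_size == 1448 and an IPv4
 * header: the first segment keeps the original sequence number and
 * gets tot_len = ip_base_len + 1448; each later segment advances
 * seqnum by 1448 and ipv4_id by 1, and the final segment uses the
 * remaining out_len together with the original FIN and PSH flags.
 */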
1008
1009
1010/**
1011 * efx_enqueue_skb_tso - segment and transmit a TSO socket buffer
1012 * @tx_queue:           Efx TX queue
1013 * @skb:                Socket buffer
1014 *
1015 * Context: You must hold netif_tx_lock() to call this function.
1016 *
 1017 * Add socket buffer @skb to @tx_queue, performing TSO.  In all
 1018 * cases @skb is consumed, even on error.  Always returns
 1019 * %NETDEV_TX_OK.
1020 */
1021static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
1022                               struct sk_buff *skb)
1023{
1024        struct efx_nic *efx = tx_queue->efx;
1025        int frag_i, rc;
1026        struct tso_state state;
1027
1028        /* Find the packet protocol and sanity-check it */
1029        state.protocol = efx_tso_check_protocol(skb);
1030
1031        EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
1032
1033        tso_start(&state, skb);
1034
1035        /* Assume that skb header area contains exactly the headers, and
1036         * all payload is in the frag list.
1037         */
1038        if (skb_headlen(skb) == state.header_len) {
1039                /* Grab the first payload fragment. */
1040                EFX_BUG_ON_PARANOID(skb_shinfo(skb)->nr_frags < 1);
1041                frag_i = 0;
1042                rc = tso_get_fragment(&state, efx,
1043                                      skb_shinfo(skb)->frags + frag_i);
1044                if (rc)
1045                        goto mem_err;
1046        } else {
1047                rc = tso_get_head_fragment(&state, efx, skb);
1048                if (rc)
1049                        goto mem_err;
1050                frag_i = -1;
1051        }
1052
1053        if (tso_start_new_packet(tx_queue, skb, &state) < 0)
1054                goto mem_err;
1055
1056        while (1) {
1057                tso_fill_packet_with_fragment(tx_queue, skb, &state);
1058
1059                /* Move onto the next fragment? */
1060                if (state.in_len == 0) {
1061                        if (++frag_i >= skb_shinfo(skb)->nr_frags)
1062                                /* End of payload reached. */
1063                                break;
1064                        rc = tso_get_fragment(&state, efx,
1065                                              skb_shinfo(skb)->frags + frag_i);
1066                        if (rc)
1067                                goto mem_err;
1068                }
1069
1070                /* Start at new packet? */
1071                if (state.packet_space == 0 &&
1072                    tso_start_new_packet(tx_queue, skb, &state) < 0)
1073                        goto mem_err;
1074        }
1075
1076        netdev_tx_sent_queue(tx_queue->core_txq, skb->len);
1077
1078        /* Pass off to hardware */
1079        efx_nic_push_buffers(tx_queue);
1080
1081        efx_tx_maybe_stop_queue(tx_queue);
1082
1083        tx_queue->tso_bursts++;
1084        return NETDEV_TX_OK;
1085
1086 mem_err:
1087        netif_err(efx, tx_err, efx->net_dev,
1088                  "Out of memory for TSO headers, or DMA mapping error\n");
1089        dev_kfree_skb_any(skb);
1090
1091        /* Free the DMA mapping we were in the process of writing out */
1092        if (state.unmap_len) {
1093                if (state.dma_flags & EFX_TX_BUF_MAP_SINGLE)
1094                        dma_unmap_single(&efx->pci_dev->dev, state.unmap_addr,
1095                                         state.unmap_len, DMA_TO_DEVICE);
1096                else
1097                        dma_unmap_page(&efx->pci_dev->dev, state.unmap_addr,
1098                                       state.unmap_len, DMA_TO_DEVICE);
1099        }
1100
1101        efx_enqueue_unwind(tx_queue);
1102        return NETDEV_TX_OK;
1103}
1104