linux/net/core/dev.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *      NET3    Protocol independent device support routines.
   4 *
   5 *      Derived from the non IP parts of dev.c 1.0.19
   6 *              Authors:        Ross Biro
   7 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   8 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
   9 *
  10 *      Additional Authors:
  11 *              Florian la Roche <rzsfl@rz.uni-sb.de>
  12 *              Alan Cox <gw4pts@gw4pts.ampr.org>
  13 *              David Hinds <dahinds@users.sourceforge.net>
  14 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  15 *              Adam Sulmicki <adam@cfar.umd.edu>
  16 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  17 *
  18 *      Changes:
  19 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  20 *                                      to 2 if register_netdev gets called
  21 *                                      before net_dev_init & also removed a
  22 *                                      few lines of code in the process.
  23 *              Alan Cox        :       device private ioctl copies fields back.
  24 *              Alan Cox        :       Transmit queue code does relevant
  25 *                                      stunts to keep the queue safe.
  26 *              Alan Cox        :       Fixed double lock.
  27 *              Alan Cox        :       Fixed promisc NULL pointer trap
  28 *              ????????        :       Support the full private ioctl range
  29 *              Alan Cox        :       Moved ioctl permission check into
  30 *                                      drivers
  31 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  32 *              Alan Cox        :       100 backlog just doesn't cut it when
  33 *                                      you start doing multicast video 8)
  34 *              Alan Cox        :       Rewrote net_bh and list manager.
  35 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  36 *              Alan Cox        :       Took out transmit every packet pass
  37 *                                      Saved a few bytes in the ioctl handler
  38 *              Alan Cox        :       Network driver sets packet type before
  39 *                                      calling netif_rx. Saves a function
  40 *                                      call a packet.
  41 *              Alan Cox        :       Hashed net_bh()
  42 *              Richard Kooijman:       Timestamp fixes.
  43 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  44 *              Alan Cox        :       Device lock protection.
  45 *              Alan Cox        :       Fixed nasty side effect of device close
  46 *                                      changes.
  47 *              Rudi Cilibrasi  :       Pass the right thing to
  48 *                                      set_mac_address()
  49 *              Dave Miller     :       32bit quantity for the device lock to
  50 *                                      make it work out on a Sparc.
  51 *              Bjorn Ekwall    :       Added KERNELD hack.
  52 *              Alan Cox        :       Cleaned up the backlog initialise.
  53 *              Craig Metz      :       SIOCGIFCONF fix if space for under
  54 *                                      1 device.
  55 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  56 *                                      is no device open function.
  57 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  58 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  59 *              Cyrus Durgin    :       Cleaned for KMOD
  60 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  61 *                                      A network device unload needs to purge
  62 *                                      the backlog queue.
  63 *      Paul Rusty Russell      :       SIOCSIFNAME
  64 *              Pekka Riikonen  :       Netdev boot-time settings code
  65 *              Andrew Morton   :       Make unregister_netdevice wait
  66 *                                      indefinitely on dev->refcnt
  67 *              J Hadi Salim    :       - Backlog queue sampling
  68 *                                      - netif_rx() feedback
  69 */
  70
  71#include <linux/uaccess.h>
  72#include <linux/bitops.h>
  73#include <linux/capability.h>
  74#include <linux/cpu.h>
  75#include <linux/types.h>
  76#include <linux/kernel.h>
  77#include <linux/hash.h>
  78#include <linux/slab.h>
  79#include <linux/sched.h>
  80#include <linux/sched/mm.h>
  81#include <linux/mutex.h>
  82#include <linux/rwsem.h>
  83#include <linux/string.h>
  84#include <linux/mm.h>
  85#include <linux/socket.h>
  86#include <linux/sockios.h>
  87#include <linux/errno.h>
  88#include <linux/interrupt.h>
  89#include <linux/if_ether.h>
  90#include <linux/netdevice.h>
  91#include <linux/etherdevice.h>
  92#include <linux/ethtool.h>
  93#include <linux/skbuff.h>
  94#include <linux/bpf.h>
  95#include <linux/bpf_trace.h>
  96#include <net/net_namespace.h>
  97#include <net/sock.h>
  98#include <net/busy_poll.h>
  99#include <linux/rtnetlink.h>
 100#include <linux/stat.h>
 101#include <net/dst.h>
 102#include <net/dst_metadata.h>
 103#include <net/pkt_sched.h>
 104#include <net/pkt_cls.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <linux/highmem.h>
 108#include <linux/init.h>
 109#include <linux/module.h>
 110#include <linux/netpoll.h>
 111#include <linux/rcupdate.h>
 112#include <linux/delay.h>
 113#include <net/iw_handler.h>
 114#include <asm/current.h>
 115#include <linux/audit.h>
 116#include <linux/dmaengine.h>
 117#include <linux/err.h>
 118#include <linux/ctype.h>
 119#include <linux/if_arp.h>
 120#include <linux/if_vlan.h>
 121#include <linux/ip.h>
 122#include <net/ip.h>
 123#include <net/mpls.h>
 124#include <linux/ipv6.h>
 125#include <linux/in.h>
 126#include <linux/jhash.h>
 127#include <linux/random.h>
 128#include <trace/events/napi.h>
 129#include <trace/events/net.h>
 130#include <trace/events/skb.h>
 131#include <linux/inetdevice.h>
 132#include <linux/cpu_rmap.h>
 133#include <linux/static_key.h>
 134#include <linux/hashtable.h>
 135#include <linux/vmalloc.h>
 136#include <linux/if_macvlan.h>
 137#include <linux/errqueue.h>
 138#include <linux/hrtimer.h>
 139#include <linux/netfilter_ingress.h>
 140#include <linux/crash_dump.h>
 141#include <linux/sctp.h>
 142#include <net/udp_tunnel.h>
 143#include <linux/net_namespace.h>
 144#include <linux/indirect_call_wrapper.h>
 145#include <net/devlink.h>
 146
 147#include "net-sysfs.h"
 148
 149#define MAX_GRO_SKBS 8
 150
 151/* This should be increased if a protocol with a bigger head is added. */
 152#define GRO_MAX_HEAD (MAX_HEADER + 128)
 153
 154static DEFINE_SPINLOCK(ptype_lock);
 155static DEFINE_SPINLOCK(offload_lock);
 156struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 157struct list_head ptype_all __read_mostly;       /* Taps */
 158static struct list_head offload_base __read_mostly;
 159
 160static int netif_rx_internal(struct sk_buff *skb);
 161static int call_netdevice_notifiers_info(unsigned long val,
 162                                         struct netdev_notifier_info *info);
 163static int call_netdevice_notifiers_extack(unsigned long val,
 164                                           struct net_device *dev,
 165                                           struct netlink_ext_ack *extack);
 166static struct napi_struct *napi_by_id(unsigned int napi_id);
 167
 168/*
 169 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 170 * semaphore.
 171 *
 172 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 173 *
 174 * Writers must hold the rtnl semaphore while they loop through the
 175 * dev_base_head list, and hold dev_base_lock for writing when they do the
 176 * actual updates.  This allows pure readers to access the list even
 177 * while a writer is preparing to update it.
 178 *
 179 * To put it another way, dev_base_lock is held for writing only to
 180 * protect against pure readers; the rtnl semaphore provides the
 181 * protection against other writers.
 182 *
  183 * For example usages, see register_netdevice() and
  184 * unregister_netdevice(), which must be called with the rtnl
  185 * semaphore held.
 186 */
 187DEFINE_RWLOCK(dev_base_lock);
 188EXPORT_SYMBOL(dev_base_lock);
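/*
 * Illustrative sketch (guarded out of the build) of the two pure-reader
 * patterns described above.  The walk helpers for_each_netdev() and
 * for_each_netdev_rcu() are the standard primitives; the function below
 * is hypothetical.
 */
#if 0
static void example_walk_devices(struct net *net)
{
	struct net_device *dev;

	/* Pure reader under dev_base_lock. */
	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		pr_info("saw %s\n", dev->name);
	read_unlock(&dev_base_lock);

	/* Pure reader under RCU: the pointers are only valid inside the
	 * read-side critical section unless a reference is taken.
	 */
	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		pr_info("saw %s\n", dev->name);
	rcu_read_unlock();
}
#endif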
 189
 190static DEFINE_MUTEX(ifalias_mutex);
 191
 192/* protects napi_hash addition/deletion and napi_gen_id */
 193static DEFINE_SPINLOCK(napi_hash_lock);
 194
 195static unsigned int napi_gen_id = NR_CPUS;
 196static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 197
 198static DECLARE_RWSEM(devnet_rename_sem);
 199
 200static inline void dev_base_seq_inc(struct net *net)
 201{
 202        while (++net->dev_base_seq == 0)
 203                ;
 204}
 205
 206static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 207{
 208        unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 209
 210        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 211}
 212
 213static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 214{
 215        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 216}
 217
 218static inline void rps_lock(struct softnet_data *sd)
 219{
 220#ifdef CONFIG_RPS
 221        spin_lock(&sd->input_pkt_queue.lock);
 222#endif
 223}
 224
 225static inline void rps_unlock(struct softnet_data *sd)
 226{
 227#ifdef CONFIG_RPS
 228        spin_unlock(&sd->input_pkt_queue.lock);
 229#endif
 230}
 231
 232static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
 233                                                       const char *name)
 234{
 235        struct netdev_name_node *name_node;
 236
 237        name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
 238        if (!name_node)
 239                return NULL;
 240        INIT_HLIST_NODE(&name_node->hlist);
 241        name_node->dev = dev;
 242        name_node->name = name;
 243        return name_node;
 244}
 245
 246static struct netdev_name_node *
 247netdev_name_node_head_alloc(struct net_device *dev)
 248{
 249        struct netdev_name_node *name_node;
 250
 251        name_node = netdev_name_node_alloc(dev, dev->name);
 252        if (!name_node)
 253                return NULL;
 254        INIT_LIST_HEAD(&name_node->list);
 255        return name_node;
 256}
 257
 258static void netdev_name_node_free(struct netdev_name_node *name_node)
 259{
 260        kfree(name_node);
 261}
 262
 263static void netdev_name_node_add(struct net *net,
 264                                 struct netdev_name_node *name_node)
 265{
 266        hlist_add_head_rcu(&name_node->hlist,
 267                           dev_name_hash(net, name_node->name));
 268}
 269
 270static void netdev_name_node_del(struct netdev_name_node *name_node)
 271{
 272        hlist_del_rcu(&name_node->hlist);
 273}
 274
 275static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
 276                                                        const char *name)
 277{
 278        struct hlist_head *head = dev_name_hash(net, name);
 279        struct netdev_name_node *name_node;
 280
 281        hlist_for_each_entry(name_node, head, hlist)
 282                if (!strcmp(name_node->name, name))
 283                        return name_node;
 284        return NULL;
 285}
 286
 287static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
 288                                                            const char *name)
 289{
 290        struct hlist_head *head = dev_name_hash(net, name);
 291        struct netdev_name_node *name_node;
 292
 293        hlist_for_each_entry_rcu(name_node, head, hlist)
 294                if (!strcmp(name_node->name, name))
 295                        return name_node;
 296        return NULL;
 297}
 298
 299int netdev_name_node_alt_create(struct net_device *dev, const char *name)
 300{
 301        struct netdev_name_node *name_node;
 302        struct net *net = dev_net(dev);
 303
 304        name_node = netdev_name_node_lookup(net, name);
 305        if (name_node)
 306                return -EEXIST;
 307        name_node = netdev_name_node_alloc(dev, name);
 308        if (!name_node)
 309                return -ENOMEM;
 310        netdev_name_node_add(net, name_node);
 311        /* The node that holds dev->name acts as a head of per-device list. */
 312        list_add_tail(&name_node->list, &dev->name_node->list);
 313
 314        return 0;
 315}
 316EXPORT_SYMBOL(netdev_name_node_alt_create);
 317
 318static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
 319{
 320        list_del(&name_node->list);
 321        netdev_name_node_del(name_node);
 322        kfree(name_node->name);
 323        netdev_name_node_free(name_node);
 324}
 325
 326int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
 327{
 328        struct netdev_name_node *name_node;
 329        struct net *net = dev_net(dev);
 330
 331        name_node = netdev_name_node_lookup(net, name);
 332        if (!name_node)
 333                return -ENOENT;
 334        /* lookup might have found our primary name or a name belonging
 335         * to another device.
 336         */
 337        if (name_node == dev->name_node || name_node->dev != dev)
 338                return -EINVAL;
 339
 340        __netdev_name_node_alt_destroy(name_node);
 341
 342        return 0;
 343}
 344EXPORT_SYMBOL(netdev_name_node_alt_destroy);
 345
 346static void netdev_name_node_alt_flush(struct net_device *dev)
 347{
 348        struct netdev_name_node *name_node, *tmp;
 349
 350        list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
 351                __netdev_name_node_alt_destroy(name_node);
 352}
 353
 354/* Device list insertion */
 355static void list_netdevice(struct net_device *dev)
 356{
 357        struct net *net = dev_net(dev);
 358
 359        ASSERT_RTNL();
 360
 361        write_lock_bh(&dev_base_lock);
 362        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 363        netdev_name_node_add(net, dev->name_node);
 364        hlist_add_head_rcu(&dev->index_hlist,
 365                           dev_index_hash(net, dev->ifindex));
 366        write_unlock_bh(&dev_base_lock);
 367
 368        dev_base_seq_inc(net);
 369}
 370
 371/* Device list removal
 372 * caller must respect a RCU grace period before freeing/reusing dev
 373 */
 374static void unlist_netdevice(struct net_device *dev)
 375{
 376        ASSERT_RTNL();
 377
 378        /* Unlink dev from the device chain */
 379        write_lock_bh(&dev_base_lock);
 380        list_del_rcu(&dev->dev_list);
 381        netdev_name_node_del(dev->name_node);
 382        hlist_del_rcu(&dev->index_hlist);
 383        write_unlock_bh(&dev_base_lock);
 384
 385        dev_base_seq_inc(dev_net(dev));
 386}
 387
 388/*
 389 *      Our notifier list
 390 */
 391
 392static RAW_NOTIFIER_HEAD(netdev_chain);
 393
 394/*
 395 *      Device drivers call our routines to queue packets here. We empty the
 396 *      queue in the local softnet handler.
 397 */
 398
 399DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 400EXPORT_PER_CPU_SYMBOL(softnet_data);
 401
 402#ifdef CONFIG_LOCKDEP
 403/*
 404 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 405 * according to dev->type
 406 */
 407static const unsigned short netdev_lock_type[] = {
 408         ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 409         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 410         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 411         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 412         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 413         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 414         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 415         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 416         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 417         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 418         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 419         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 420         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 421         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 422         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 423
 424static const char *const netdev_lock_name[] = {
 425        "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 426        "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 427        "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 428        "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 429        "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 430        "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 431        "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 432        "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 433        "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 434        "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 435        "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 436        "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 437        "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 438        "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 439        "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 440
 441static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 442static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 443
 444static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 445{
 446        int i;
 447
 448        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 449                if (netdev_lock_type[i] == dev_type)
 450                        return i;
 451        /* the last key is used by default */
 452        return ARRAY_SIZE(netdev_lock_type) - 1;
 453}
 454
 455static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 456                                                 unsigned short dev_type)
 457{
 458        int i;
 459
 460        i = netdev_lock_pos(dev_type);
 461        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 462                                   netdev_lock_name[i]);
 463}
 464
 465static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 466{
 467        int i;
 468
 469        i = netdev_lock_pos(dev->type);
 470        lockdep_set_class_and_name(&dev->addr_list_lock,
 471                                   &netdev_addr_lock_key[i],
 472                                   netdev_lock_name[i]);
 473}
 474#else
 475static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 476                                                 unsigned short dev_type)
 477{
 478}
 479
 480static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 481{
 482}
 483#endif
 484
 485/*******************************************************************************
 486 *
 487 *              Protocol management and registration routines
 488 *
 489 *******************************************************************************/
 490
 491
 492/*
 493 *      Add a protocol ID to the list. Now that the input handler is
 494 *      smarter we can dispense with all the messy stuff that used to be
 495 *      here.
 496 *
  497 *      BEWARE!!! Protocol handlers that mangle input packets
  498 *      MUST BE last in the hash buckets, and protocol checking
  499 *      MUST start from the promiscuous ptype_all chain in net_bh.
  500 *      This is true now, do not change it.
  501 *      Explanation: if a protocol handler that mangles packets were
  502 *      first on the list, it could not sense that the packet is
  503 *      cloned and should be copied-on-write, so it would change the
  504 *      clone in place and subsequent readers would get a broken packet.
 505 *                                                      --ANK (980803)
 506 */
 507
 508static inline struct list_head *ptype_head(const struct packet_type *pt)
 509{
 510        if (pt->type == htons(ETH_P_ALL))
 511                return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 512        else
 513                return pt->dev ? &pt->dev->ptype_specific :
 514                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 515}
 516
 517/**
 518 *      dev_add_pack - add packet handler
 519 *      @pt: packet type declaration
 520 *
 521 *      Add a protocol handler to the networking stack. The passed &packet_type
 522 *      is linked into kernel lists and may not be freed until it has been
 523 *      removed from the kernel lists.
 524 *
  525 *      This call does not sleep, therefore it cannot guarantee that
  526 *      all CPUs that are in the middle of receiving packets
  527 *      will see the new packet type (until the next received packet).
 528 */
 529
 530void dev_add_pack(struct packet_type *pt)
 531{
 532        struct list_head *head = ptype_head(pt);
 533
 534        spin_lock(&ptype_lock);
 535        list_add_rcu(&pt->list, head);
 536        spin_unlock(&ptype_lock);
 537}
 538EXPORT_SYMBOL(dev_add_pack);
 539
 540/**
 541 *      __dev_remove_pack        - remove packet handler
 542 *      @pt: packet type declaration
 543 *
 544 *      Remove a protocol handler that was previously added to the kernel
 545 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 546 *      from the kernel lists and can be freed or reused once this function
 547 *      returns.
 548 *
 549 *      The packet type might still be in use by receivers
  550 *      and must not be freed until after all the CPUs have gone
 551 *      through a quiescent state.
 552 */
 553void __dev_remove_pack(struct packet_type *pt)
 554{
 555        struct list_head *head = ptype_head(pt);
 556        struct packet_type *pt1;
 557
 558        spin_lock(&ptype_lock);
 559
 560        list_for_each_entry(pt1, head, list) {
 561                if (pt == pt1) {
 562                        list_del_rcu(&pt->list);
 563                        goto out;
 564                }
 565        }
 566
 567        pr_warn("dev_remove_pack: %p not found\n", pt);
 568out:
 569        spin_unlock(&ptype_lock);
 570}
 571EXPORT_SYMBOL(__dev_remove_pack);
 572
 573/**
 574 *      dev_remove_pack  - remove packet handler
 575 *      @pt: packet type declaration
 576 *
 577 *      Remove a protocol handler that was previously added to the kernel
 578 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 579 *      from the kernel lists and can be freed or reused once this function
 580 *      returns.
 581 *
 582 *      This call sleeps to guarantee that no CPU is looking at the packet
 583 *      type after return.
 584 */
 585void dev_remove_pack(struct packet_type *pt)
 586{
 587        __dev_remove_pack(pt);
 588
 589        synchronize_net();
 590}
 591EXPORT_SYMBOL(dev_remove_pack);
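/*
 * Illustrative sketch (guarded out of the build) of how a protocol module
 * typically uses dev_add_pack()/dev_remove_pack().  The handler and the
 * init/exit wrappers are hypothetical; the struct packet_type fields and
 * the registration calls are the API documented above.
 */
#if 0
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* The handler owns the skb and must free or forward it. */
	pr_debug("example: %u bytes from %s\n", skb->len, dev->name);
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_pt __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* a tap, see ptype_head() above */
	.func = example_rcv,
};

static int __init example_init(void)
{
	dev_add_pack(&example_pt);
	return 0;
}

static void __exit example_exit(void)
{
	/* Sleeps (synchronize_net), so no CPU still sees example_pt after
	 * this returns and the structure may be freed.
	 */
	dev_remove_pack(&example_pt);
}
#endif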
 592
 593
 594/**
 595 *      dev_add_offload - register offload handlers
 596 *      @po: protocol offload declaration
 597 *
 598 *      Add protocol offload handlers to the networking stack. The passed
 599 *      &proto_offload is linked into kernel lists and may not be freed until
 600 *      it has been removed from the kernel lists.
 601 *
  602 *      This call does not sleep, therefore it cannot guarantee that
  603 *      all CPUs that are in the middle of receiving packets
  604 *      will see the new offload handlers (until the next received packet).
 605 */
 606void dev_add_offload(struct packet_offload *po)
 607{
 608        struct packet_offload *elem;
 609
 610        spin_lock(&offload_lock);
 611        list_for_each_entry(elem, &offload_base, list) {
 612                if (po->priority < elem->priority)
 613                        break;
 614        }
 615        list_add_rcu(&po->list, elem->list.prev);
 616        spin_unlock(&offload_lock);
 617}
 618EXPORT_SYMBOL(dev_add_offload);
 619
 620/**
 621 *      __dev_remove_offload     - remove offload handler
 622 *      @po: packet offload declaration
 623 *
 624 *      Remove a protocol offload handler that was previously added to the
 625 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 626 *      is removed from the kernel lists and can be freed or reused once this
 627 *      function returns.
 628 *
 629 *      The packet type might still be in use by receivers
  630 *      and must not be freed until after all the CPUs have gone
 631 *      through a quiescent state.
 632 */
 633static void __dev_remove_offload(struct packet_offload *po)
 634{
 635        struct list_head *head = &offload_base;
 636        struct packet_offload *po1;
 637
 638        spin_lock(&offload_lock);
 639
 640        list_for_each_entry(po1, head, list) {
 641                if (po == po1) {
 642                        list_del_rcu(&po->list);
 643                        goto out;
 644                }
 645        }
 646
 647        pr_warn("dev_remove_offload: %p not found\n", po);
 648out:
 649        spin_unlock(&offload_lock);
 650}
 651
 652/**
 653 *      dev_remove_offload       - remove packet offload handler
 654 *      @po: packet offload declaration
 655 *
 656 *      Remove a packet offload handler that was previously added to the kernel
 657 *      offload handlers by dev_add_offload(). The passed &offload_type is
 658 *      removed from the kernel lists and can be freed or reused once this
 659 *      function returns.
 660 *
 661 *      This call sleeps to guarantee that no CPU is looking at the packet
 662 *      type after return.
 663 */
 664void dev_remove_offload(struct packet_offload *po)
 665{
 666        __dev_remove_offload(po);
 667
 668        synchronize_net();
 669}
 670EXPORT_SYMBOL(dev_remove_offload);
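/*
 * Illustrative sketch (guarded out of the build) of registering a
 * packet_offload with dev_add_offload().  The type and priority fields
 * are real; the GSO/GRO callbacks are omitted here and would be filled
 * in by an actual protocol (see net/ipv4/af_inet.c for a real user).
 */
#if 0
static struct packet_offload example_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.priority = 1,	/* lower values sort earlier in offload_base */
	/* .callbacks = { .gro_receive = ..., .gro_complete = ... }, */
};

static void example_register_offload(void)
{
	dev_add_offload(&example_offload);
}

static void example_unregister_offload(void)
{
	dev_remove_offload(&example_offload);	/* sleeps: synchronize_net() */
}
#endif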
 671
 672/******************************************************************************
 673 *
 674 *                    Device Boot-time Settings Routines
 675 *
 676 ******************************************************************************/
 677
 678/* Boot time configuration table */
 679static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 680
 681/**
 682 *      netdev_boot_setup_add   - add new setup entry
 683 *      @name: name of the device
 684 *      @map: configured settings for the device
 685 *
  686 *      Adds a new setup entry to the dev_boot_setup list.  The function
  687 *      returns 0 on error and 1 on success.  This is a generic routine for
  688 *      all netdevices.
 689 */
 690static int netdev_boot_setup_add(char *name, struct ifmap *map)
 691{
 692        struct netdev_boot_setup *s;
 693        int i;
 694
 695        s = dev_boot_setup;
 696        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 697                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 698                        memset(s[i].name, 0, sizeof(s[i].name));
 699                        strlcpy(s[i].name, name, IFNAMSIZ);
 700                        memcpy(&s[i].map, map, sizeof(s[i].map));
 701                        break;
 702                }
 703        }
 704
 705        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 706}
 707
 708/**
 709 * netdev_boot_setup_check      - check boot time settings
 710 * @dev: the netdevice
 711 *
 712 * Check boot time settings for the device.
  713 * The settings found are applied to the device so that they can be
  714 * used later in the device probing.
  715 * Returns 0 if no settings are found, 1 if they are.
 716 */
 717int netdev_boot_setup_check(struct net_device *dev)
 718{
 719        struct netdev_boot_setup *s = dev_boot_setup;
 720        int i;
 721
 722        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 723                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 724                    !strcmp(dev->name, s[i].name)) {
 725                        dev->irq = s[i].map.irq;
 726                        dev->base_addr = s[i].map.base_addr;
 727                        dev->mem_start = s[i].map.mem_start;
 728                        dev->mem_end = s[i].map.mem_end;
 729                        return 1;
 730                }
 731        }
 732        return 0;
 733}
 734EXPORT_SYMBOL(netdev_boot_setup_check);
 735
 736
 737/**
 738 * netdev_boot_base     - get address from boot time settings
 739 * @prefix: prefix for network device
 740 * @unit: id for network device
 741 *
  742 * Check boot time settings for the base address of the device.
  743 * Returns the configured base address if a matching entry is found,
  744 * 1 if the device is already registered (do not probe), and
  745 * 0 if no settings are found.
 746 */
 747unsigned long netdev_boot_base(const char *prefix, int unit)
 748{
 749        const struct netdev_boot_setup *s = dev_boot_setup;
 750        char name[IFNAMSIZ];
 751        int i;
 752
 753        sprintf(name, "%s%d", prefix, unit);
 754
 755        /*
 756         * If device already registered then return base of 1
 757         * to indicate not to probe for this interface
 758         */
 759        if (__dev_get_by_name(&init_net, name))
 760                return 1;
 761
 762        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 763                if (!strcmp(name, s[i].name))
 764                        return s[i].map.base_addr;
 765        return 0;
 766}
 767
 768/*
 769 * Saves at boot time configured settings for any netdevice.
 770 */
 771int __init netdev_boot_setup(char *str)
 772{
 773        int ints[5];
 774        struct ifmap map;
 775
 776        str = get_options(str, ARRAY_SIZE(ints), ints);
 777        if (!str || !*str)
 778                return 0;
 779
 780        /* Save settings */
 781        memset(&map, 0, sizeof(map));
 782        if (ints[0] > 0)
 783                map.irq = ints[1];
 784        if (ints[0] > 1)
 785                map.base_addr = ints[2];
 786        if (ints[0] > 2)
 787                map.mem_start = ints[3];
 788        if (ints[0] > 3)
 789                map.mem_end = ints[4];
 790
 791        /* Add new entry to the list */
 792        return netdev_boot_setup_add(str, &map);
 793}
 794
 795__setup("netdev=", netdev_boot_setup);
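/*
 * Illustrative example of the "netdev=" option handled above.  A command
 * line such as
 *
 *	netdev=5,0x300,0,0,eth0
 *
 * is parsed by netdev_boot_setup() as irq=5, base_addr=0x300,
 * mem_start=0, mem_end=0, with the trailing "eth0" becoming the entry
 * name passed to netdev_boot_setup_add().  The concrete values are made
 * up for illustration.
 */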
 796
 797/*******************************************************************************
 798 *
 799 *                          Device Interface Subroutines
 800 *
 801 *******************************************************************************/
 802
 803/**
  804 *      dev_get_iflink  - get 'iflink' value of an interface
 805 *      @dev: targeted interface
 806 *
 807 *      Indicates the ifindex the interface is linked to.
 808 *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 809 */
 810
 811int dev_get_iflink(const struct net_device *dev)
 812{
 813        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 814                return dev->netdev_ops->ndo_get_iflink(dev);
 815
 816        return dev->ifindex;
 817}
 818EXPORT_SYMBOL(dev_get_iflink);
 819
 820/**
 821 *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 822 *      @dev: targeted interface
 823 *      @skb: The packet.
 824 *
  825 *      For better visibility of tunnel traffic, OVS needs to retrieve
  826 *      egress tunnel information for a packet. The following API allows
  827 *      the user to get this info.
 828 */
 829int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 830{
 831        struct ip_tunnel_info *info;
 832
  833        if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
 834                return -EINVAL;
 835
 836        info = skb_tunnel_info_unclone(skb);
 837        if (!info)
 838                return -ENOMEM;
 839        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 840                return -EINVAL;
 841
 842        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 843}
 844EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 845
 846/**
 847 *      __dev_get_by_name       - find a device by its name
 848 *      @net: the applicable net namespace
 849 *      @name: name to find
 850 *
 851 *      Find an interface by name. Must be called under RTNL semaphore
 852 *      or @dev_base_lock. If the name is found a pointer to the device
 853 *      is returned. If the name is not found then %NULL is returned. The
 854 *      reference counters are not incremented so the caller must be
 855 *      careful with locks.
 856 */
 857
 858struct net_device *__dev_get_by_name(struct net *net, const char *name)
 859{
 860        struct netdev_name_node *node_name;
 861
 862        node_name = netdev_name_node_lookup(net, name);
 863        return node_name ? node_name->dev : NULL;
 864}
 865EXPORT_SYMBOL(__dev_get_by_name);
 866
 867/**
 868 * dev_get_by_name_rcu  - find a device by its name
 869 * @net: the applicable net namespace
 870 * @name: name to find
 871 *
 872 * Find an interface by name.
 873 * If the name is found a pointer to the device is returned.
 874 * If the name is not found then %NULL is returned.
 875 * The reference counters are not incremented so the caller must be
 876 * careful with locks. The caller must hold RCU lock.
 877 */
 878
 879struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 880{
 881        struct netdev_name_node *node_name;
 882
 883        node_name = netdev_name_node_lookup_rcu(net, name);
 884        return node_name ? node_name->dev : NULL;
 885}
 886EXPORT_SYMBOL(dev_get_by_name_rcu);
 887
 888/**
 889 *      dev_get_by_name         - find a device by its name
 890 *      @net: the applicable net namespace
 891 *      @name: name to find
 892 *
 893 *      Find an interface by name. This can be called from any
 894 *      context and does its own locking. The returned handle has
 895 *      the usage count incremented and the caller must use dev_put() to
 896 *      release it when it is no longer needed. %NULL is returned if no
 897 *      matching device is found.
 898 */
 899
 900struct net_device *dev_get_by_name(struct net *net, const char *name)
 901{
 902        struct net_device *dev;
 903
 904        rcu_read_lock();
 905        dev = dev_get_by_name_rcu(net, name);
 906        if (dev)
 907                dev_hold(dev);
 908        rcu_read_unlock();
 909        return dev;
 910}
 911EXPORT_SYMBOL(dev_get_by_name);
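/*
 * Illustrative sketch (guarded out of the build) of the refcounted lookup
 * pattern dev_get_by_name() provides.  The function and the interface
 * name are hypothetical; dev_get_by_name() and dev_put() are the API
 * documented above.
 */
#if 0
static int example_use_by_name(struct net *net)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, "eth0");
	if (!dev)
		return -ENODEV;

	pr_info("%s: ifindex %d, mtu %u\n", dev->name, dev->ifindex, dev->mtu);

	/* Release the reference taken by dev_get_by_name(). */
	dev_put(dev);
	return 0;
}
#endif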
 912
 913/**
 914 *      __dev_get_by_index - find a device by its ifindex
 915 *      @net: the applicable net namespace
 916 *      @ifindex: index of device
 917 *
 918 *      Search for an interface by index. Returns %NULL if the device
 919 *      is not found or a pointer to the device. The device has not
 920 *      had its reference counter increased so the caller must be careful
 921 *      about locking. The caller must hold either the RTNL semaphore
 922 *      or @dev_base_lock.
 923 */
 924
 925struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 926{
 927        struct net_device *dev;
 928        struct hlist_head *head = dev_index_hash(net, ifindex);
 929
 930        hlist_for_each_entry(dev, head, index_hlist)
 931                if (dev->ifindex == ifindex)
 932                        return dev;
 933
 934        return NULL;
 935}
 936EXPORT_SYMBOL(__dev_get_by_index);
 937
 938/**
 939 *      dev_get_by_index_rcu - find a device by its ifindex
 940 *      @net: the applicable net namespace
 941 *      @ifindex: index of device
 942 *
 943 *      Search for an interface by index. Returns %NULL if the device
 944 *      is not found or a pointer to the device. The device has not
 945 *      had its reference counter increased so the caller must be careful
 946 *      about locking. The caller must hold RCU lock.
 947 */
 948
 949struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 950{
 951        struct net_device *dev;
 952        struct hlist_head *head = dev_index_hash(net, ifindex);
 953
 954        hlist_for_each_entry_rcu(dev, head, index_hlist)
 955                if (dev->ifindex == ifindex)
 956                        return dev;
 957
 958        return NULL;
 959}
 960EXPORT_SYMBOL(dev_get_by_index_rcu);
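/*
 * Illustrative sketch (guarded out of the build) of an RCU-side lookup
 * with dev_get_by_index_rcu().  The function is hypothetical; note that
 * the device pointer is only valid inside the read-side critical section
 * unless dev_hold() is called before rcu_read_unlock().
 */
#if 0
static bool example_ifindex_is_up(struct net *net, int ifindex)
{
	struct net_device *dev;
	bool up = false;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		up = !!(dev->flags & IFF_UP);
	rcu_read_unlock();

	return up;
}
#endif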
 961
 962
 963/**
 964 *      dev_get_by_index - find a device by its ifindex
 965 *      @net: the applicable net namespace
 966 *      @ifindex: index of device
 967 *
 968 *      Search for an interface by index. Returns NULL if the device
 969 *      is not found or a pointer to the device. The device returned has
 970 *      had a reference added and the pointer is safe until the user calls
 971 *      dev_put to indicate they have finished with it.
 972 */
 973
 974struct net_device *dev_get_by_index(struct net *net, int ifindex)
 975{
 976        struct net_device *dev;
 977
 978        rcu_read_lock();
 979        dev = dev_get_by_index_rcu(net, ifindex);
 980        if (dev)
 981                dev_hold(dev);
 982        rcu_read_unlock();
 983        return dev;
 984}
 985EXPORT_SYMBOL(dev_get_by_index);
 986
 987/**
 988 *      dev_get_by_napi_id - find a device by napi_id
 989 *      @napi_id: ID of the NAPI struct
 990 *
 991 *      Search for an interface by NAPI ID. Returns %NULL if the device
 992 *      is not found or a pointer to the device. The device has not had
 993 *      its reference counter increased so the caller must be careful
 994 *      about locking. The caller must hold RCU lock.
 995 */
 996
 997struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 998{
 999        struct napi_struct *napi;
1000
1001        WARN_ON_ONCE(!rcu_read_lock_held());
1002
1003        if (napi_id < MIN_NAPI_ID)
1004                return NULL;
1005
1006        napi = napi_by_id(napi_id);
1007
1008        return napi ? napi->dev : NULL;
1009}
1010EXPORT_SYMBOL(dev_get_by_napi_id);
1011
1012/**
1013 *      netdev_get_name - get a netdevice name, knowing its ifindex.
1014 *      @net: network namespace
1015 *      @name: a pointer to the buffer where the name will be stored.
1016 *      @ifindex: the ifindex of the interface to get the name from.
1017 */
1018int netdev_get_name(struct net *net, char *name, int ifindex)
1019{
1020        struct net_device *dev;
1021        int ret;
1022
1023        down_read(&devnet_rename_sem);
1024        rcu_read_lock();
1025
1026        dev = dev_get_by_index_rcu(net, ifindex);
1027        if (!dev) {
1028                ret = -ENODEV;
1029                goto out;
1030        }
1031
1032        strcpy(name, dev->name);
1033
1034        ret = 0;
1035out:
1036        rcu_read_unlock();
1037        up_read(&devnet_rename_sem);
1038        return ret;
1039}
1040
1041/**
1042 *      dev_getbyhwaddr_rcu - find a device by its hardware address
1043 *      @net: the applicable net namespace
1044 *      @type: media type of device
1045 *      @ha: hardware address
1046 *
1047 *      Search for an interface by MAC address. Returns NULL if the device
1048 *      is not found or a pointer to the device.
1049 *      The caller must hold RCU or RTNL.
1050 *      The returned device has not had its ref count increased
1051 *      and the caller must therefore be careful about locking
1052 *
1053 */
1054
1055struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
1056                                       const char *ha)
1057{
1058        struct net_device *dev;
1059
1060        for_each_netdev_rcu(net, dev)
1061                if (dev->type == type &&
1062                    !memcmp(dev->dev_addr, ha, dev->addr_len))
1063                        return dev;
1064
1065        return NULL;
1066}
1067EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
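/*
 * Illustrative sketch (guarded out of the build) of a hardware-address
 * lookup with dev_getbyhwaddr_rcu().  The MAC address is made up;
 * ARPHRD_ETHER and ETH_ALEN are the usual constants for Ethernet.
 */
#if 0
static struct net_device *example_find_by_mac(struct net *net)
{
	static const char mac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	if (dev)
		dev_hold(dev);	/* keep it usable after rcu_read_unlock() */
	rcu_read_unlock();

	return dev;	/* may be NULL; caller must dev_put() when done */
}
#endif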
1068
1069struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1070{
1071        struct net_device *dev;
1072
1073        ASSERT_RTNL();
1074        for_each_netdev(net, dev)
1075                if (dev->type == type)
1076                        return dev;
1077
1078        return NULL;
1079}
1080EXPORT_SYMBOL(__dev_getfirstbyhwtype);
1081
1082struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1083{
1084        struct net_device *dev, *ret = NULL;
1085
1086        rcu_read_lock();
1087        for_each_netdev_rcu(net, dev)
1088                if (dev->type == type) {
1089                        dev_hold(dev);
1090                        ret = dev;
1091                        break;
1092                }
1093        rcu_read_unlock();
1094        return ret;
1095}
1096EXPORT_SYMBOL(dev_getfirstbyhwtype);
1097
1098/**
1099 *      __dev_get_by_flags - find any device with given flags
1100 *      @net: the applicable net namespace
1101 *      @if_flags: IFF_* values
1102 *      @mask: bitmask of bits in if_flags to check
1103 *
1104 *      Search for any interface with the given flags. Returns NULL if a device
1105 *      is not found or a pointer to the device. Must be called inside
1106 *      rtnl_lock(), and result refcount is unchanged.
1107 */
1108
1109struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1110                                      unsigned short mask)
1111{
1112        struct net_device *dev, *ret;
1113
1114        ASSERT_RTNL();
1115
1116        ret = NULL;
1117        for_each_netdev(net, dev) {
1118                if (((dev->flags ^ if_flags) & mask) == 0) {
1119                        ret = dev;
1120                        break;
1121                }
1122        }
1123        return ret;
1124}
1125EXPORT_SYMBOL(__dev_get_by_flags);
1126
1127/**
1128 *      dev_valid_name - check if name is okay for network device
1129 *      @name: name string
1130 *
 1131 *      Network device names need to be valid file names to
 1132 *      allow sysfs to work.  We also disallow any kind of
1133 *      whitespace.
1134 */
1135bool dev_valid_name(const char *name)
1136{
1137        if (*name == '\0')
1138                return false;
1139        if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1140                return false;
1141        if (!strcmp(name, ".") || !strcmp(name, ".."))
1142                return false;
1143
1144        while (*name) {
1145                if (*name == '/' || *name == ':' || isspace(*name))
1146                        return false;
1147                name++;
1148        }
1149        return true;
1150}
1151EXPORT_SYMBOL(dev_valid_name);
1152
1153/**
1154 *      __dev_alloc_name - allocate a name for a device
1155 *      @net: network namespace to allocate the device name in
1156 *      @name: name format string
1157 *      @buf:  scratch buffer and result name string
1158 *
 1159 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1160 *      id. It scans the list of devices to build up a free map, then chooses
1161 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1162 *      while allocating the name and adding the device in order to avoid
1163 *      duplicates.
1164 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1165 *      Returns the number of the unit assigned or a negative errno code.
1166 */
1167
1168static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1169{
1170        int i = 0;
1171        const char *p;
1172        const int max_netdevices = 8*PAGE_SIZE;
1173        unsigned long *inuse;
1174        struct net_device *d;
1175
1176        if (!dev_valid_name(name))
1177                return -EINVAL;
1178
1179        p = strchr(name, '%');
1180        if (p) {
1181                /*
1182                 * Verify the string as this thing may have come from
1183                 * the user.  There must be either one "%d" and no other "%"
1184                 * characters.
1185                 */
1186                if (p[1] != 'd' || strchr(p + 2, '%'))
1187                        return -EINVAL;
1188
1189                /* Use one page as a bit array of possible slots */
1190                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1191                if (!inuse)
1192                        return -ENOMEM;
1193
1194                for_each_netdev(net, d) {
1195                        if (!sscanf(d->name, name, &i))
1196                                continue;
1197                        if (i < 0 || i >= max_netdevices)
1198                                continue;
1199
1200                        /*  avoid cases where sscanf is not exact inverse of printf */
1201                        snprintf(buf, IFNAMSIZ, name, i);
1202                        if (!strncmp(buf, d->name, IFNAMSIZ))
1203                                set_bit(i, inuse);
1204                }
1205
1206                i = find_first_zero_bit(inuse, max_netdevices);
1207                free_page((unsigned long) inuse);
1208        }
1209
1210        snprintf(buf, IFNAMSIZ, name, i);
1211        if (!__dev_get_by_name(net, buf))
1212                return i;
1213
1214        /* It is possible to run out of possible slots
1215         * when the name is long and there isn't enough space left
1216         * for the digits, or if all bits are used.
1217         */
1218        return -ENFILE;
1219}
1220
1221static int dev_alloc_name_ns(struct net *net,
1222                             struct net_device *dev,
1223                             const char *name)
1224{
1225        char buf[IFNAMSIZ];
1226        int ret;
1227
1228        BUG_ON(!net);
1229        ret = __dev_alloc_name(net, name, buf);
1230        if (ret >= 0)
1231                strlcpy(dev->name, buf, IFNAMSIZ);
1232        return ret;
1233}
1234
1235/**
1236 *      dev_alloc_name - allocate a name for a device
1237 *      @dev: device
1238 *      @name: name format string
1239 *
 1240 *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 1241 *      id. It scans the list of devices to build up a free map, then chooses
1242 *      the first empty slot. The caller must hold the dev_base or rtnl lock
1243 *      while allocating the name and adding the device in order to avoid
1244 *      duplicates.
1245 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1246 *      Returns the number of the unit assigned or a negative errno code.
1247 */
1248
1249int dev_alloc_name(struct net_device *dev, const char *name)
1250{
1251        return dev_alloc_name_ns(dev_net(dev), dev, name);
1252}
1253EXPORT_SYMBOL(dev_alloc_name);
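/*
 * Illustrative sketch (guarded out of the build) of the "%d" wildcard
 * handled by dev_alloc_name().  The "example%d" template and the
 * function are hypothetical.
 */
#if 0
static int example_pick_name(struct net_device *dev)
{
	int unit;

	/* Caller must hold the rtnl lock, as noted above. */
	unit = dev_alloc_name(dev, "example%d");
	if (unit < 0)
		return unit;	/* -EINVAL, -ENFILE, ... */

	/* dev->name now holds e.g. "example0"; unit is the number chosen. */
	return 0;
}
#endif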
1254
1255static int dev_get_valid_name(struct net *net, struct net_device *dev,
1256                              const char *name)
1257{
1258        BUG_ON(!net);
1259
1260        if (!dev_valid_name(name))
1261                return -EINVAL;
1262
1263        if (strchr(name, '%'))
1264                return dev_alloc_name_ns(net, dev, name);
1265        else if (__dev_get_by_name(net, name))
1266                return -EEXIST;
1267        else if (dev->name != name)
1268                strlcpy(dev->name, name, IFNAMSIZ);
1269
1270        return 0;
1271}
1272
1273/**
1274 *      dev_change_name - change name of a device
1275 *      @dev: device
1276 *      @newname: name (or format string) must be at least IFNAMSIZ
1277 *
 1278 *      Change the name of a device; a format string such as "eth%d"
 1279 *      can be passed for wildcarding.
1280 */
1281int dev_change_name(struct net_device *dev, const char *newname)
1282{
1283        unsigned char old_assign_type;
1284        char oldname[IFNAMSIZ];
1285        int err = 0;
1286        int ret;
1287        struct net *net;
1288
1289        ASSERT_RTNL();
1290        BUG_ON(!dev_net(dev));
1291
1292        net = dev_net(dev);
1293
1294        /* Some auto-enslaved devices e.g. failover slaves are
1295         * special, as userspace might rename the device after
1296         * the interface had been brought up and running since
 1297         * the point the kernel initiated auto-enslavement. Allow
1298         * live name change even when these slave devices are
1299         * up and running.
1300         *
1301         * Typically, users of these auto-enslaving devices
1302         * don't actually care about slave name change, as
1303         * they are supposed to operate on master interface
1304         * directly.
1305         */
1306        if (dev->flags & IFF_UP &&
1307            likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
1308                return -EBUSY;
1309
1310        down_write(&devnet_rename_sem);
1311
1312        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1313                up_write(&devnet_rename_sem);
1314                return 0;
1315        }
1316
1317        memcpy(oldname, dev->name, IFNAMSIZ);
1318
1319        err = dev_get_valid_name(net, dev, newname);
1320        if (err < 0) {
1321                up_write(&devnet_rename_sem);
1322                return err;
1323        }
1324
1325        if (oldname[0] && !strchr(oldname, '%'))
1326                netdev_info(dev, "renamed from %s\n", oldname);
1327
1328        old_assign_type = dev->name_assign_type;
1329        dev->name_assign_type = NET_NAME_RENAMED;
1330
1331rollback:
1332        ret = device_rename(&dev->dev, dev->name);
1333        if (ret) {
1334                memcpy(dev->name, oldname, IFNAMSIZ);
1335                dev->name_assign_type = old_assign_type;
1336                up_write(&devnet_rename_sem);
1337                return ret;
1338        }
1339
1340        up_write(&devnet_rename_sem);
1341
1342        netdev_adjacent_rename_links(dev, oldname);
1343
1344        write_lock_bh(&dev_base_lock);
1345        netdev_name_node_del(dev->name_node);
1346        write_unlock_bh(&dev_base_lock);
1347
1348        synchronize_rcu();
1349
1350        write_lock_bh(&dev_base_lock);
1351        netdev_name_node_add(net, dev->name_node);
1352        write_unlock_bh(&dev_base_lock);
1353
1354        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1355        ret = notifier_to_errno(ret);
1356
1357        if (ret) {
1358                /* err >= 0 after dev_alloc_name() or stores the first errno */
1359                if (err >= 0) {
1360                        err = ret;
1361                        down_write(&devnet_rename_sem);
1362                        memcpy(dev->name, oldname, IFNAMSIZ);
1363                        memcpy(oldname, newname, IFNAMSIZ);
1364                        dev->name_assign_type = old_assign_type;
1365                        old_assign_type = NET_NAME_RENAMED;
1366                        goto rollback;
1367                } else {
1368                        pr_err("%s: name change rollback failed: %d\n",
1369                               dev->name, ret);
1370                }
1371        }
1372
1373        return err;
1374}
1375
1376/**
1377 *      dev_set_alias - change ifalias of a device
1378 *      @dev: device
1379 *      @alias: name up to IFALIASZ
1380 *      @len: limit of bytes to copy from info
1381 *
 1382 *      Set ifalias for a device.
1383 */
1384int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1385{
1386        struct dev_ifalias *new_alias = NULL;
1387
1388        if (len >= IFALIASZ)
1389                return -EINVAL;
1390
1391        if (len) {
1392                new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1393                if (!new_alias)
1394                        return -ENOMEM;
1395
1396                memcpy(new_alias->ifalias, alias, len);
1397                new_alias->ifalias[len] = 0;
1398        }
1399
1400        mutex_lock(&ifalias_mutex);
1401        new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1402                                        mutex_is_locked(&ifalias_mutex));
1403        mutex_unlock(&ifalias_mutex);
1404
1405        if (new_alias)
1406                kfree_rcu(new_alias, rcuhead);
1407
1408        return len;
1409}
1410EXPORT_SYMBOL(dev_set_alias);
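/*
 * Illustrative sketch (guarded out of the build) of setting and reading
 * an ifalias.  The alias string is made up; dev_set_alias() returns the
 * stored length (or a negative errno) and dev_get_alias() is documented
 * just below.
 */
#if 0
static void example_alias(struct net_device *dev)
{
	static const char alias[] = "uplink to core switch";
	char buf[IFALIASZ];

	if (dev_set_alias(dev, alias, strlen(alias)) < 0)
		return;

	if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
		pr_info("%s alias: %s\n", dev->name, buf);
}
#endif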
1411
1412/**
1413 *      dev_get_alias - get ifalias of a device
1414 *      @dev: device
1415 *      @name: buffer to store name of ifalias
1416 *      @len: size of buffer
1417 *
 1418 *      Get ifalias for a device.  The caller must make sure dev cannot go
 1419 *      away, e.g. by holding the RCU read lock or a reference to the device.
1420 */
1421int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1422{
1423        const struct dev_ifalias *alias;
1424        int ret = 0;
1425
1426        rcu_read_lock();
1427        alias = rcu_dereference(dev->ifalias);
1428        if (alias)
1429                ret = snprintf(name, len, "%s", alias->ifalias);
1430        rcu_read_unlock();
1431
1432        return ret;
1433}
1434
1435/**
1436 *      netdev_features_change - device changes features
1437 *      @dev: device to cause notification
1438 *
1439 *      Called to indicate a device has changed features.
1440 */
1441void netdev_features_change(struct net_device *dev)
1442{
1443        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1444}
1445EXPORT_SYMBOL(netdev_features_change);
1446
1447/**
1448 *      netdev_state_change - device changes state
1449 *      @dev: device to cause notification
1450 *
1451 *      Called to indicate a device has changed state. This function calls
1452 *      the notifier chains for netdev_chain and sends a NEWLINK message
1453 *      to the routing socket.
1454 */
1455void netdev_state_change(struct net_device *dev)
1456{
1457        if (dev->flags & IFF_UP) {
1458                struct netdev_notifier_change_info change_info = {
1459                        .info.dev = dev,
1460                };
1461
1462                call_netdevice_notifiers_info(NETDEV_CHANGE,
1463                                              &change_info.info);
1464                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1465        }
1466}
1467EXPORT_SYMBOL(netdev_state_change);
1468
1469/**
1470 * netdev_notify_peers - notify network peers about existence of @dev
1471 * @dev: network device
1472 *
1473 * Generate traffic such that interested network peers are aware of
1474 * @dev, such as by generating a gratuitous ARP. This may be used when
1475 * a device wants to inform the rest of the network about some sort of
1476 * reconfiguration such as a failover event or virtual machine
1477 * migration.
1478 */
1479void netdev_notify_peers(struct net_device *dev)
1480{
1481        rtnl_lock();
1482        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1483        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1484        rtnl_unlock();
1485}
1486EXPORT_SYMBOL(netdev_notify_peers);
1487
1488static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1489{
1490        const struct net_device_ops *ops = dev->netdev_ops;
1491        int ret;
1492
1493        ASSERT_RTNL();
1494
1495        if (!netif_device_present(dev))
1496                return -ENODEV;
1497
1498        /* Block netpoll from trying to do any rx path servicing.
 1499         * If we don't do this, there is a chance ndo_poll_controller
 1500         * or ndo_poll may be running while we open the device.
1501         */
1502        netpoll_poll_disable(dev);
1503
1504        ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1505        ret = notifier_to_errno(ret);
1506        if (ret)
1507                return ret;
1508
1509        set_bit(__LINK_STATE_START, &dev->state);
1510
1511        if (ops->ndo_validate_addr)
1512                ret = ops->ndo_validate_addr(dev);
1513
1514        if (!ret && ops->ndo_open)
1515                ret = ops->ndo_open(dev);
1516
1517        netpoll_poll_enable(dev);
1518
1519        if (ret)
1520                clear_bit(__LINK_STATE_START, &dev->state);
1521        else {
1522                dev->flags |= IFF_UP;
1523                dev_set_rx_mode(dev);
1524                dev_activate(dev);
1525                add_device_randomness(dev->dev_addr, dev->addr_len);
1526        }
1527
1528        return ret;
1529}
1530
1531/**
1532 *      dev_open        - prepare an interface for use.
1533 *      @dev: device to open
1534 *      @extack: netlink extended ack
1535 *
1536 *      Takes a device from down to up state. The device's private open
1537 *      function is invoked and then the multicast lists are loaded. Finally
1538 *      the device is moved into the up state and a %NETDEV_UP message is
1539 *      sent to the netdev notifier chain.
1540 *
1541 *      Calling this function on an active interface is a nop. On a failure
1542 *      a negative errno code is returned.
1543 */
1544int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1545{
1546        int ret;
1547
1548        if (dev->flags & IFF_UP)
1549                return 0;
1550
1551        ret = __dev_open(dev, extack);
1552        if (ret < 0)
1553                return ret;
1554
1555        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1556        call_netdevice_notifiers(NETDEV_UP, dev);
1557
1558        return ret;
1559}
1560EXPORT_SYMBOL(dev_open);
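
/* Illustrative sketch (not part of the original source): in-kernel callers
 * bring an interface up by taking the RTNL lock around dev_open(), assuming
 * "dev" is a valid net_device they already hold a reference to.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_open(dev, NULL);	// NULL: no extended ack requested
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("failed to bring %s up: %d\n", dev->name, err);
 */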
1561
1562static void __dev_close_many(struct list_head *head)
1563{
1564        struct net_device *dev;
1565
1566        ASSERT_RTNL();
1567        might_sleep();
1568
1569        list_for_each_entry(dev, head, close_list) {
1570                /* Temporarily disable netpoll until the interface is down */
1571                netpoll_poll_disable(dev);
1572
1573                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1574
1575                clear_bit(__LINK_STATE_START, &dev->state);
1576
1577                /* Synchronize to scheduled poll. We cannot touch poll list, it
1578                 * can be even on different cpu. So just clear netif_running().
1579                 *
1580                 * dev->stop() will invoke napi_disable() on all of its
1581                 * napi_struct instances on this device.
1582                 */
1583                smp_mb__after_atomic(); /* Commit netif_running(). */
1584        }
1585
1586        dev_deactivate_many(head);
1587
1588        list_for_each_entry(dev, head, close_list) {
1589                const struct net_device_ops *ops = dev->netdev_ops;
1590
1591                /*
1592                 *      Call the device specific close. This cannot fail
1593                 *      and is only done if the device is UP.
1594                 *
1595                 *      We allow it to be called even after a DETACH hot-plug
1596                 *      event.
1597                 */
1598                if (ops->ndo_stop)
1599                        ops->ndo_stop(dev);
1600
1601                dev->flags &= ~IFF_UP;
1602                netpoll_poll_enable(dev);
1603        }
1604}
1605
1606static void __dev_close(struct net_device *dev)
1607{
1608        LIST_HEAD(single);
1609
1610        list_add(&dev->close_list, &single);
1611        __dev_close_many(&single);
1612        list_del(&single);
1613}
1614
1615void dev_close_many(struct list_head *head, bool unlink)
1616{
1617        struct net_device *dev, *tmp;
1618
1619        /* Remove the devices that don't need to be closed */
1620        list_for_each_entry_safe(dev, tmp, head, close_list)
1621                if (!(dev->flags & IFF_UP))
1622                        list_del_init(&dev->close_list);
1623
1624        __dev_close_many(head);
1625
1626        list_for_each_entry_safe(dev, tmp, head, close_list) {
1627                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1628                call_netdevice_notifiers(NETDEV_DOWN, dev);
1629                if (unlink)
1630                        list_del_init(&dev->close_list);
1631        }
1632}
1633EXPORT_SYMBOL(dev_close_many);
1634
1635/**
1636 *      dev_close - shutdown an interface.
1637 *      @dev: device to shutdown
1638 *
1639 *      This function moves an active device into down state. A
1640 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1641 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1642 *      chain.
1643 */
1644void dev_close(struct net_device *dev)
1645{
1646        if (dev->flags & IFF_UP) {
1647                LIST_HEAD(single);
1648
1649                list_add(&dev->close_list, &single);
1650                dev_close_many(&single, true);
1651                list_del(&single);
1652        }
1653}
1654EXPORT_SYMBOL(dev_close);
1655
1656
1657/**
1658 *      dev_disable_lro - disable Large Receive Offload on a device
1659 *      @dev: device
1660 *
1661 *      Disable Large Receive Offload (LRO) on a net device.  Must be
1662 *      called under RTNL.  This is needed if received packets may be
1663 *      forwarded to another interface.
1664 */
1665void dev_disable_lro(struct net_device *dev)
1666{
1667        struct net_device *lower_dev;
1668        struct list_head *iter;
1669
1670        dev->wanted_features &= ~NETIF_F_LRO;
1671        netdev_update_features(dev);
1672
1673        if (unlikely(dev->features & NETIF_F_LRO))
1674                netdev_WARN(dev, "failed to disable LRO!\n");
1675
1676        netdev_for_each_lower_dev(dev, lower_dev, iter)
1677                dev_disable_lro(lower_dev);
1678}
1679EXPORT_SYMBOL(dev_disable_lro);
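
/* Illustrative sketch (not part of the original source): code that starts
 * forwarding packets received on a device (e.g. when enslaving it to a
 * bridge-like upper device) is expected to turn LRO off first, with the
 * RTNL lock already held. "port_dev" is a hypothetical lower device.
 *
 *	ASSERT_RTNL();
 *	dev_disable_lro(port_dev);
 */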
1680
1681/**
1682 *      dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1683 *      @dev: device
1684 *
1685 *      Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1686 *      called under RTNL.  This is needed if Generic XDP is installed on
1687 *      the device.
1688 */
1689static void dev_disable_gro_hw(struct net_device *dev)
1690{
1691        dev->wanted_features &= ~NETIF_F_GRO_HW;
1692        netdev_update_features(dev);
1693
1694        if (unlikely(dev->features & NETIF_F_GRO_HW))
1695                netdev_WARN(dev, "failed to disable GRO_HW!\n");
1696}
1697
1698const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1699{
1700#define N(val)                                          \
1701        case NETDEV_##val:                              \
1702                return "NETDEV_" __stringify(val);
1703        switch (cmd) {
1704        N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1705        N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1706        N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1707        N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1708        N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1709        N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1710        N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1711        N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1712        N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1713        N(PRE_CHANGEADDR)
1714        }
1715#undef N
1716        return "UNKNOWN_NETDEV_EVENT";
1717}
1718EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
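
/* Illustrative sketch (not part of the original source): the string form is
 * handy in debug output, for instance inside a notifier callback where
 * "event" is the unsigned long notifier value and "dev" the device:
 *
 *	pr_debug("%s: got %s\n", dev->name, netdev_cmd_to_name(event));
 */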
1719
1720static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1721                                   struct net_device *dev)
1722{
1723        struct netdev_notifier_info info = {
1724                .dev = dev,
1725        };
1726
1727        return nb->notifier_call(nb, val, &info);
1728}
1729
1730static int call_netdevice_register_notifiers(struct notifier_block *nb,
1731                                             struct net_device *dev)
1732{
1733        int err;
1734
1735        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1736        err = notifier_to_errno(err);
1737        if (err)
1738                return err;
1739
1740        if (!(dev->flags & IFF_UP))
1741                return 0;
1742
1743        call_netdevice_notifier(nb, NETDEV_UP, dev);
1744        return 0;
1745}
1746
1747static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1748                                                struct net_device *dev)
1749{
1750        if (dev->flags & IFF_UP) {
1751                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1752                                        dev);
1753                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1754        }
1755        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1756}
1757
1758static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1759                                                 struct net *net)
1760{
1761        struct net_device *dev;
1762        int err;
1763
1764        for_each_netdev(net, dev) {
1765                err = call_netdevice_register_notifiers(nb, dev);
1766                if (err)
1767                        goto rollback;
1768        }
1769        return 0;
1770
1771rollback:
1772        for_each_netdev_continue_reverse(net, dev)
1773                call_netdevice_unregister_notifiers(nb, dev);
1774        return err;
1775}
1776
1777static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1778                                                    struct net *net)
1779{
1780        struct net_device *dev;
1781
1782        for_each_netdev(net, dev)
1783                call_netdevice_unregister_notifiers(nb, dev);
1784}
1785
1786static int dev_boot_phase = 1;
1787
1788/**
1789 * register_netdevice_notifier - register a network notifier block
1790 * @nb: notifier
1791 *
1792 * Register a notifier to be called when network device events occur.
1793 * The notifier passed is linked into the kernel structures and must
1794 * not be reused until it has been unregistered. A negative errno code
1795 * is returned on a failure.
1796 *
1797 * When registered, all registration and up events are replayed
1798 * to the new notifier to allow it to have a race-free
1799 * view of the network device list.
1800 */
1801
1802int register_netdevice_notifier(struct notifier_block *nb)
1803{
1804        struct net *net;
1805        int err;
1806
1807        /* Close race with setup_net() and cleanup_net() */
1808        down_write(&pernet_ops_rwsem);
1809        rtnl_lock();
1810        err = raw_notifier_chain_register(&netdev_chain, nb);
1811        if (err)
1812                goto unlock;
1813        if (dev_boot_phase)
1814                goto unlock;
1815        for_each_net(net) {
1816                err = call_netdevice_register_net_notifiers(nb, net);
1817                if (err)
1818                        goto rollback;
1819        }
1820
1821unlock:
1822        rtnl_unlock();
1823        up_write(&pernet_ops_rwsem);
1824        return err;
1825
1826rollback:
1827        for_each_net_continue_reverse(net)
1828                call_netdevice_unregister_net_notifiers(nb, net);
1829
1830        raw_notifier_chain_unregister(&netdev_chain, nb);
1831        goto unlock;
1832}
1833EXPORT_SYMBOL(register_netdevice_notifier);
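
/* Illustrative sketch (not part of the original source): a minimal notifier.
 * The callback name and the reaction to NETDEV_UP are hypothetical;
 * netdev_notifier_info_to_dev() is the usual way to recover the device from
 * the info pointer passed to the callback.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_debug("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	// at module init / exit:
 *	// register_netdevice_notifier(&example_nb);
 *	// unregister_netdevice_notifier(&example_nb);
 */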
1834
1835/**
1836 * unregister_netdevice_notifier - unregister a network notifier block
1837 * @nb: notifier
1838 *
1839 * Unregister a notifier previously registered by
1840 * register_netdevice_notifier(). The notifier is unlinked from the
1841 * kernel structures and may then be reused. A negative errno code
1842 * is returned on a failure.
1843 *
1844 * After unregistering, unregister and down device events are synthesized
1845 * for all devices on the device list to the removed notifier to remove
1846 * the need for special case cleanup code.
1847 */
1848
1849int unregister_netdevice_notifier(struct notifier_block *nb)
1850{
1851        struct net *net;
1852        int err;
1853
1854        /* Close race with setup_net() and cleanup_net() */
1855        down_write(&pernet_ops_rwsem);
1856        rtnl_lock();
1857        err = raw_notifier_chain_unregister(&netdev_chain, nb);
1858        if (err)
1859                goto unlock;
1860
1861        for_each_net(net)
1862                call_netdevice_unregister_net_notifiers(nb, net);
1863
1864unlock:
1865        rtnl_unlock();
1866        up_write(&pernet_ops_rwsem);
1867        return err;
1868}
1869EXPORT_SYMBOL(unregister_netdevice_notifier);
1870
1871static int __register_netdevice_notifier_net(struct net *net,
1872                                             struct notifier_block *nb,
1873                                             bool ignore_call_fail)
1874{
1875        int err;
1876
1877        err = raw_notifier_chain_register(&net->netdev_chain, nb);
1878        if (err)
1879                return err;
1880        if (dev_boot_phase)
1881                return 0;
1882
1883        err = call_netdevice_register_net_notifiers(nb, net);
1884        if (err && !ignore_call_fail)
1885                goto chain_unregister;
1886
1887        return 0;
1888
1889chain_unregister:
1890        raw_notifier_chain_unregister(&net->netdev_chain, nb);
1891        return err;
1892}
1893
1894static int __unregister_netdevice_notifier_net(struct net *net,
1895                                               struct notifier_block *nb)
1896{
1897        int err;
1898
1899        err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1900        if (err)
1901                return err;
1902
1903        call_netdevice_unregister_net_notifiers(nb, net);
1904        return 0;
1905}
1906
1907/**
1908 * register_netdevice_notifier_net - register a per-netns network notifier block
1909 * @net: network namespace
1910 * @nb: notifier
1911 *
1912 * Register a notifier to be called when network device events occur.
1913 * The notifier passed is linked into the kernel structures and must
1914 * not be reused until it has been unregistered. A negative errno code
1915 * is returned on a failure.
1916 *
1917 * When registered, all registration and up events are replayed
1918 * to the new notifier to allow it to have a race-free
1919 * view of the network device list.
1920 */
1921
1922int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1923{
1924        int err;
1925
1926        rtnl_lock();
1927        err = __register_netdevice_notifier_net(net, nb, false);
1928        rtnl_unlock();
1929        return err;
1930}
1931EXPORT_SYMBOL(register_netdevice_notifier_net);
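
/* Illustrative sketch (not part of the original source): the per-netns
 * variant takes the namespace explicitly. A subsystem that only cares about
 * the initial namespace could register a notifier_block like the example
 * above against init_net:
 *
 *	err = register_netdevice_notifier_net(&init_net, &example_nb);
 */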
1932
1933/**
1934 * unregister_netdevice_notifier_net - unregister a per-netns
1935 *                                     network notifier block
1936 * @net: network namespace
1937 * @nb: notifier
1938 *
1939 * Unregister a notifier previously registered by
1940 * register_netdevice_notifier_net(). The notifier is unlinked from the
1941 * kernel structures and may then be reused. A negative errno code
1942 * is returned on a failure.
1943 *
1944 * After unregistering, unregister and down device events are synthesized
1945 * for all devices on the device list to the removed notifier to remove
1946 * the need for special case cleanup code.
1947 */
1948
1949int unregister_netdevice_notifier_net(struct net *net,
1950                                      struct notifier_block *nb)
1951{
1952        int err;
1953
1954        rtnl_lock();
1955        err = __unregister_netdevice_notifier_net(net, nb);
1956        rtnl_unlock();
1957        return err;
1958}
1959EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1960
1961int register_netdevice_notifier_dev_net(struct net_device *dev,
1962                                        struct notifier_block *nb,
1963                                        struct netdev_net_notifier *nn)
1964{
1965        int err;
1966
1967        rtnl_lock();
1968        err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1969        if (!err) {
1970                nn->nb = nb;
1971                list_add(&nn->list, &dev->net_notifier_list);
1972        }
1973        rtnl_unlock();
1974        return err;
1975}
1976EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1977
1978int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1979                                          struct notifier_block *nb,
1980                                          struct netdev_net_notifier *nn)
1981{
1982        int err;
1983
1984        rtnl_lock();
1985        list_del(&nn->list);
1986        err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1987        rtnl_unlock();
1988        return err;
1989}
1990EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1991
1992static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1993                                             struct net *net)
1994{
1995        struct netdev_net_notifier *nn;
1996
1997        list_for_each_entry(nn, &dev->net_notifier_list, list) {
1998                __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
1999                __register_netdevice_notifier_net(net, nn->nb, true);
2000        }
2001}
2002
2003/**
2004 *      call_netdevice_notifiers_info - call all network notifier blocks
2005 *      @val: value passed unmodified to notifier function
2006 *      @info: notifier information data
2007 *
2008 *      Call all network notifier blocks.  Parameters and return value
2009 *      are as for raw_notifier_call_chain().
2010 */
2011
2012static int call_netdevice_notifiers_info(unsigned long val,
2013                                         struct netdev_notifier_info *info)
2014{
2015        struct net *net = dev_net(info->dev);
2016        int ret;
2017
2018        ASSERT_RTNL();
2019
2020        /* Run per-netns notifier block chain first, then run the global one.
2021         * Hopefully, one day, the global one is going to be removed after
2022         * all notifier block registrants are converted to be per-netns.
2023         */
2024        ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2025        if (ret & NOTIFY_STOP_MASK)
2026                return ret;
2027        return raw_notifier_call_chain(&netdev_chain, val, info);
2028}
2029
2030static int call_netdevice_notifiers_extack(unsigned long val,
2031                                           struct net_device *dev,
2032                                           struct netlink_ext_ack *extack)
2033{
2034        struct netdev_notifier_info info = {
2035                .dev = dev,
2036                .extack = extack,
2037        };
2038
2039        return call_netdevice_notifiers_info(val, &info);
2040}
2041
2042/**
2043 *      call_netdevice_notifiers - call all network notifier blocks
2044 *      @val: value passed unmodified to notifier function
2045 *      @dev: net_device pointer passed unmodified to notifier function
2046 *
2047 *      Call all network notifier blocks.  Parameters and return value
2048 *      are as for raw_notifier_call_chain().
2049 */
2050
2051int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2052{
2053        return call_netdevice_notifiers_extack(val, dev, NULL);
2054}
2055EXPORT_SYMBOL(call_netdevice_notifiers);
2056
2057/**
2058 *      call_netdevice_notifiers_mtu - call all network notifier blocks
2059 *      @val: value passed unmodified to notifier function
2060 *      @dev: net_device pointer passed unmodified to notifier function
2061 *      @arg: additional u32 argument passed to the notifier function
2062 *
2063 *      Call all network notifier blocks.  Parameters and return value
2064 *      are as for raw_notifier_call_chain().
2065 */
2066static int call_netdevice_notifiers_mtu(unsigned long val,
2067                                        struct net_device *dev, u32 arg)
2068{
2069        struct netdev_notifier_info_ext info = {
2070                .info.dev = dev,
2071                .ext.mtu = arg,
2072        };
2073
2074        BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2075
2076        return call_netdevice_notifiers_info(val, &info.info);
2077}
2078
2079#ifdef CONFIG_NET_INGRESS
2080static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2081
2082void net_inc_ingress_queue(void)
2083{
2084        static_branch_inc(&ingress_needed_key);
2085}
2086EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2087
2088void net_dec_ingress_queue(void)
2089{
2090        static_branch_dec(&ingress_needed_key);
2091}
2092EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2093#endif
2094
2095#ifdef CONFIG_NET_EGRESS
2096static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2097
2098void net_inc_egress_queue(void)
2099{
2100        static_branch_inc(&egress_needed_key);
2101}
2102EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2103
2104void net_dec_egress_queue(void)
2105{
2106        static_branch_dec(&egress_needed_key);
2107}
2108EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2109#endif
2110
2111static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2112#ifdef CONFIG_JUMP_LABEL
2113static atomic_t netstamp_needed_deferred;
2114static atomic_t netstamp_wanted;
2115static void netstamp_clear(struct work_struct *work)
2116{
2117        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2118        int wanted;
2119
2120        wanted = atomic_add_return(deferred, &netstamp_wanted);
2121        if (wanted > 0)
2122                static_branch_enable(&netstamp_needed_key);
2123        else
2124                static_branch_disable(&netstamp_needed_key);
2125}
2126static DECLARE_WORK(netstamp_work, netstamp_clear);
2127#endif
2128
2129void net_enable_timestamp(void)
2130{
2131#ifdef CONFIG_JUMP_LABEL
2132        int wanted;
2133
2134        while (1) {
2135                wanted = atomic_read(&netstamp_wanted);
2136                if (wanted <= 0)
2137                        break;
2138                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
2139                        return;
2140        }
2141        atomic_inc(&netstamp_needed_deferred);
2142        schedule_work(&netstamp_work);
2143#else
2144        static_branch_inc(&netstamp_needed_key);
2145#endif
2146}
2147EXPORT_SYMBOL(net_enable_timestamp);
2148
2149void net_disable_timestamp(void)
2150{
2151#ifdef CONFIG_JUMP_LABEL
2152        int wanted;
2153
2154        while (1) {
2155                wanted = atomic_read(&netstamp_wanted);
2156                if (wanted <= 1)
2157                        break;
2158                if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
2159                        return;
2160        }
2161        atomic_dec(&netstamp_needed_deferred);
2162        schedule_work(&netstamp_work);
2163#else
2164        static_branch_dec(&netstamp_needed_key);
2165#endif
2166}
2167EXPORT_SYMBOL(net_disable_timestamp);
2168
2169static inline void net_timestamp_set(struct sk_buff *skb)
2170{
2171        skb->tstamp = 0;
2172        if (static_branch_unlikely(&netstamp_needed_key))
2173                __net_timestamp(skb);
2174}
2175
2176#define net_timestamp_check(COND, SKB)                          \
2177        if (static_branch_unlikely(&netstamp_needed_key)) {     \
2178                if ((COND) && !(SKB)->tstamp)                   \
2179                        __net_timestamp(SKB);                   \
2180        }                                                       \
2181
2182bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2183{
2184        unsigned int len;
2185
2186        if (!(dev->flags & IFF_UP))
2187                return false;
2188
2189        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
2190        if (skb->len <= len)
2191                return true;
2192
2193        /* if TSO is enabled, we don't care about the length as the packet
2194         * could be forwarded without being segmented first
2195         */
2196        if (skb_is_gso(skb))
2197                return true;
2198
2199        return false;
2200}
2201EXPORT_SYMBOL_GPL(is_skb_forwardable);
2202
2203int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2204{
2205        int ret = ____dev_forward_skb(dev, skb);
2206
2207        if (likely(!ret)) {
2208                skb->protocol = eth_type_trans(skb, dev);
2209                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2210        }
2211
2212        return ret;
2213}
2214EXPORT_SYMBOL_GPL(__dev_forward_skb);
2215
2216/**
2217 * dev_forward_skb - loopback an skb to another netif
2218 *
2219 * @dev: destination network device
2220 * @skb: buffer to forward
2221 *
2222 * return values:
2223 *      NET_RX_SUCCESS  (no congestion)
2224 *      NET_RX_DROP     (packet was dropped, but freed)
2225 *
2226 * dev_forward_skb can be used for injecting an skb from the
2227 * start_xmit function of one device into the receive queue
2228 * of another device.
2229 *
2230 * The receiving device may be in another namespace, so
2231 * we have to clear all information in the skb that could
2232 * impact namespace isolation.
2233 */
2234int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2235{
2236        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2237}
2238EXPORT_SYMBOL_GPL(dev_forward_skb);
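
/* Illustrative sketch (not part of the original source): a paired device
 * such as a veth-like driver can hand frames from its ndo_start_xmit()
 * straight to the peer's receive path. "example_get_peer()" is hypothetical;
 * dev_forward_skb() always consumes the skb, so no extra free is needed.
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = example_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */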
2239
2240static inline int deliver_skb(struct sk_buff *skb,
2241                              struct packet_type *pt_prev,
2242                              struct net_device *orig_dev)
2243{
2244        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2245                return -ENOMEM;
2246        refcount_inc(&skb->users);
2247        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2248}
2249
2250static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2251                                          struct packet_type **pt,
2252                                          struct net_device *orig_dev,
2253                                          __be16 type,
2254                                          struct list_head *ptype_list)
2255{
2256        struct packet_type *ptype, *pt_prev = *pt;
2257
2258        list_for_each_entry_rcu(ptype, ptype_list, list) {
2259                if (ptype->type != type)
2260                        continue;
2261                if (pt_prev)
2262                        deliver_skb(skb, pt_prev, orig_dev);
2263                pt_prev = ptype;
2264        }
2265        *pt = pt_prev;
2266}
2267
2268static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2269{
2270        if (!ptype->af_packet_priv || !skb->sk)
2271                return false;
2272
2273        if (ptype->id_match)
2274                return ptype->id_match(ptype, skb->sk);
2275        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2276                return true;
2277
2278        return false;
2279}
2280
2281/**
2282 * dev_nit_active - return true if any network interface taps are in use
2283 *
2284 * @dev: network device to check for the presence of taps
2285 */
2286bool dev_nit_active(struct net_device *dev)
2287{
2288        return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2289}
2290EXPORT_SYMBOL_GPL(dev_nit_active);
2291
2292/*
2293 *      Support routine. Sends outgoing frames to any network
2294 *      taps currently in use.
2295 */
2296
2297void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2298{
2299        struct packet_type *ptype;
2300        struct sk_buff *skb2 = NULL;
2301        struct packet_type *pt_prev = NULL;
2302        struct list_head *ptype_list = &ptype_all;
2303
2304        rcu_read_lock();
2305again:
2306        list_for_each_entry_rcu(ptype, ptype_list, list) {
2307                if (ptype->ignore_outgoing)
2308                        continue;
2309
2310                /* Never send packets back to the socket
2311                 * they originated from - MvS (miquels@drinkel.ow.org)
2312                 */
2313                if (skb_loop_sk(ptype, skb))
2314                        continue;
2315
2316                if (pt_prev) {
2317                        deliver_skb(skb2, pt_prev, skb->dev);
2318                        pt_prev = ptype;
2319                        continue;
2320                }
2321
2322                /* need to clone skb, done only once */
2323                skb2 = skb_clone(skb, GFP_ATOMIC);
2324                if (!skb2)
2325                        goto out_unlock;
2326
2327                net_timestamp_set(skb2);
2328
2329                /* The network header should already be
2330                 * set by the sender, so the check below is
2331                 * just protection against buggy protocols.
2332                 */
2333                skb_reset_mac_header(skb2);
2334
2335                if (skb_network_header(skb2) < skb2->data ||
2336                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2337                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2338                                             ntohs(skb2->protocol),
2339                                             dev->name);
2340                        skb_reset_network_header(skb2);
2341                }
2342
2343                skb2->transport_header = skb2->network_header;
2344                skb2->pkt_type = PACKET_OUTGOING;
2345                pt_prev = ptype;
2346        }
2347
2348        if (ptype_list == &ptype_all) {
2349                ptype_list = &dev->ptype_all;
2350                goto again;
2351        }
2352out_unlock:
2353        if (pt_prev) {
2354                if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2355                        pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2356                else
2357                        kfree_skb(skb2);
2358        }
2359        rcu_read_unlock();
2360}
2361EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2362
2363/**
2364 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2365 * @dev: Network device
2366 * @txq: number of queues available
2367 *
2368 * If real_num_tx_queues is changed the tc mappings may no longer be
2369 * valid. To resolve this, verify that each tc mapping remains valid and,
2370 * if not, zero the mapping. With no priorities mapping to an
2371 * offset/count pair it will no longer be used. In the worst case, if TC0
2372 * is invalid, nothing can be done, so priority mappings are disabled. It
2373 * is expected that drivers will fix this mapping if they can before
2374 * calling netif_set_real_num_tx_queues.
2375 */
2376static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2377{
2378        int i;
2379        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2380
2381        /* If TC0 is invalidated disable TC mapping */
2382        if (tc->offset + tc->count > txq) {
2383                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2384                dev->num_tc = 0;
2385                return;
2386        }
2387
2388        /* Invalidated prio to tc mappings set to TC0 */
2389        for (i = 1; i < TC_BITMASK + 1; i++) {
2390                int q = netdev_get_prio_tc_map(dev, i);
2391
2392                tc = &dev->tc_to_txq[q];
2393                if (tc->offset + tc->count > txq) {
2394                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2395                                i, q);
2396                        netdev_set_prio_tc_map(dev, i, 0);
2397                }
2398        }
2399}
2400
2401int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2402{
2403        if (dev->num_tc) {
2404                struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2405                int i;
2406
2407                /* walk through the TCs and see if it falls into any of them */
2408                for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2409                        if ((txq - tc->offset) < tc->count)
2410                                return i;
2411                }
2412
2413                /* didn't find it, just return -1 to indicate no match */
2414                return -1;
2415        }
2416
2417        return 0;
2418}
2419EXPORT_SYMBOL(netdev_txq_to_tc);
2420
2421#ifdef CONFIG_XPS
2422struct static_key xps_needed __read_mostly;
2423EXPORT_SYMBOL(xps_needed);
2424struct static_key xps_rxqs_needed __read_mostly;
2425EXPORT_SYMBOL(xps_rxqs_needed);
2426static DEFINE_MUTEX(xps_map_mutex);
2427#define xmap_dereference(P)             \
2428        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2429
2430static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2431                             int tci, u16 index)
2432{
2433        struct xps_map *map = NULL;
2434        int pos;
2435
2436        if (dev_maps)
2437                map = xmap_dereference(dev_maps->attr_map[tci]);
2438        if (!map)
2439                return false;
2440
2441        for (pos = map->len; pos--;) {
2442                if (map->queues[pos] != index)
2443                        continue;
2444
2445                if (map->len > 1) {
2446                        map->queues[pos] = map->queues[--map->len];
2447                        break;
2448                }
2449
2450                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2451                kfree_rcu(map, rcu);
2452                return false;
2453        }
2454
2455        return true;
2456}
2457
2458static bool remove_xps_queue_cpu(struct net_device *dev,
2459                                 struct xps_dev_maps *dev_maps,
2460                                 int cpu, u16 offset, u16 count)
2461{
2462        int num_tc = dev->num_tc ? : 1;
2463        bool active = false;
2464        int tci;
2465
2466        for (tci = cpu * num_tc; num_tc--; tci++) {
2467                int i, j;
2468
2469                for (i = count, j = offset; i--; j++) {
2470                        if (!remove_xps_queue(dev_maps, tci, j))
2471                                break;
2472                }
2473
2474                active |= i < 0;
2475        }
2476
2477        return active;
2478}
2479
2480static void reset_xps_maps(struct net_device *dev,
2481                           struct xps_dev_maps *dev_maps,
2482                           bool is_rxqs_map)
2483{
2484        if (is_rxqs_map) {
2485                static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2486                RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2487        } else {
2488                RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2489        }
2490        static_key_slow_dec_cpuslocked(&xps_needed);
2491        kfree_rcu(dev_maps, rcu);
2492}
2493
2494static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2495                           struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2496                           u16 offset, u16 count, bool is_rxqs_map)
2497{
2498        bool active = false;
2499        int i, j;
2500
2501        for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2502             j < nr_ids;)
2503                active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2504                                               count);
2505        if (!active)
2506                reset_xps_maps(dev, dev_maps, is_rxqs_map);
2507
2508        if (!is_rxqs_map) {
2509                for (i = offset + (count - 1); count--; i--) {
2510                        netdev_queue_numa_node_write(
2511                                netdev_get_tx_queue(dev, i),
2512                                NUMA_NO_NODE);
2513                }
2514        }
2515}
2516
2517static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2518                                   u16 count)
2519{
2520        const unsigned long *possible_mask = NULL;
2521        struct xps_dev_maps *dev_maps;
2522        unsigned int nr_ids;
2523
2524        if (!static_key_false(&xps_needed))
2525                return;
2526
2527        cpus_read_lock();
2528        mutex_lock(&xps_map_mutex);
2529
2530        if (static_key_false(&xps_rxqs_needed)) {
2531                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2532                if (dev_maps) {
2533                        nr_ids = dev->num_rx_queues;
2534                        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2535                                       offset, count, true);
2536                }
2537        }
2538
2539        dev_maps = xmap_dereference(dev->xps_cpus_map);
2540        if (!dev_maps)
2541                goto out_no_maps;
2542
2543        if (num_possible_cpus() > 1)
2544                possible_mask = cpumask_bits(cpu_possible_mask);
2545        nr_ids = nr_cpu_ids;
2546        clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2547                       false);
2548
2549out_no_maps:
2550        mutex_unlock(&xps_map_mutex);
2551        cpus_read_unlock();
2552}
2553
2554static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2555{
2556        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2557}
2558
2559static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2560                                      u16 index, bool is_rxqs_map)
2561{
2562        struct xps_map *new_map;
2563        int alloc_len = XPS_MIN_MAP_ALLOC;
2564        int i, pos;
2565
2566        for (pos = 0; map && pos < map->len; pos++) {
2567                if (map->queues[pos] != index)
2568                        continue;
2569                return map;
2570        }
2571
2572        /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2573        if (map) {
2574                if (pos < map->alloc_len)
2575                        return map;
2576
2577                alloc_len = map->alloc_len * 2;
2578        }
2579
2580        /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2581         *  map
2582         */
2583        if (is_rxqs_map)
2584                new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2585        else
2586                new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2587                                       cpu_to_node(attr_index));
2588        if (!new_map)
2589                return NULL;
2590
2591        for (i = 0; i < pos; i++)
2592                new_map->queues[i] = map->queues[i];
2593        new_map->alloc_len = alloc_len;
2594        new_map->len = pos;
2595
2596        return new_map;
2597}
2598
2599/* Must be called under cpus_read_lock */
2600int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2601                          u16 index, bool is_rxqs_map)
2602{
2603        const unsigned long *online_mask = NULL, *possible_mask = NULL;
2604        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2605        int i, j, tci, numa_node_id = -2;
2606        int maps_sz, num_tc = 1, tc = 0;
2607        struct xps_map *map, *new_map;
2608        bool active = false;
2609        unsigned int nr_ids;
2610
2611        if (dev->num_tc) {
2612                /* Do not allow XPS on subordinate device directly */
2613                num_tc = dev->num_tc;
2614                if (num_tc < 0)
2615                        return -EINVAL;
2616
2617                /* If queue belongs to subordinate dev use its map */
2618                dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2619
2620                tc = netdev_txq_to_tc(dev, index);
2621                if (tc < 0)
2622                        return -EINVAL;
2623        }
2624
2625        mutex_lock(&xps_map_mutex);
2626        if (is_rxqs_map) {
2627                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2628                dev_maps = xmap_dereference(dev->xps_rxqs_map);
2629                nr_ids = dev->num_rx_queues;
2630        } else {
2631                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2632                if (num_possible_cpus() > 1) {
2633                        online_mask = cpumask_bits(cpu_online_mask);
2634                        possible_mask = cpumask_bits(cpu_possible_mask);
2635                }
2636                dev_maps = xmap_dereference(dev->xps_cpus_map);
2637                nr_ids = nr_cpu_ids;
2638        }
2639
2640        if (maps_sz < L1_CACHE_BYTES)
2641                maps_sz = L1_CACHE_BYTES;
2642
2643        /* allocate memory for queue storage */
2644        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2645             j < nr_ids;) {
2646                if (!new_dev_maps)
2647                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2648                if (!new_dev_maps) {
2649                        mutex_unlock(&xps_map_mutex);
2650                        return -ENOMEM;
2651                }
2652
2653                tci = j * num_tc + tc;
2654                map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2655                                 NULL;
2656
2657                map = expand_xps_map(map, j, index, is_rxqs_map);
2658                if (!map)
2659                        goto error;
2660
2661                RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2662        }
2663
2664        if (!new_dev_maps)
2665                goto out_no_new_maps;
2666
2667        if (!dev_maps) {
2668                /* Increment static keys at most once per type */
2669                static_key_slow_inc_cpuslocked(&xps_needed);
2670                if (is_rxqs_map)
2671                        static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2672        }
2673
2674        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2675             j < nr_ids;) {
2676                /* copy maps belonging to foreign traffic classes */
2677                for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2678                        /* fill in the new device map from the old device map */
2679                        map = xmap_dereference(dev_maps->attr_map[tci]);
2680                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2681                }
2682
2683                /* We need to explicitly update tci as the previous loop
2684                 * could break out early if dev_maps is NULL.
2685                 */
2686                tci = j * num_tc + tc;
2687
2688                if (netif_attr_test_mask(j, mask, nr_ids) &&
2689                    netif_attr_test_online(j, online_mask, nr_ids)) {
2690                        /* add tx-queue to CPU/rx-queue maps */
2691                        int pos = 0;
2692
2693                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
2694                        while ((pos < map->len) && (map->queues[pos] != index))
2695                                pos++;
2696
2697                        if (pos == map->len)
2698                                map->queues[map->len++] = index;
2699#ifdef CONFIG_NUMA
2700                        if (!is_rxqs_map) {
2701                                if (numa_node_id == -2)
2702                                        numa_node_id = cpu_to_node(j);
2703                                else if (numa_node_id != cpu_to_node(j))
2704                                        numa_node_id = -1;
2705                        }
2706#endif
2707                } else if (dev_maps) {
2708                        /* fill in the new device map from the old device map */
2709                        map = xmap_dereference(dev_maps->attr_map[tci]);
2710                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2711                }
2712
2713                /* copy maps belonging to foreign traffic classes */
2714                for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2715                        /* fill in the new device map from the old device map */
2716                        map = xmap_dereference(dev_maps->attr_map[tci]);
2717                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2718                }
2719        }
2720
2721        if (is_rxqs_map)
2722                rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2723        else
2724                rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2725
2726        /* Cleanup old maps */
2727        if (!dev_maps)
2728                goto out_no_old_maps;
2729
2730        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2731             j < nr_ids;) {
2732                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2733                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2734                        map = xmap_dereference(dev_maps->attr_map[tci]);
2735                        if (map && map != new_map)
2736                                kfree_rcu(map, rcu);
2737                }
2738        }
2739
2740        kfree_rcu(dev_maps, rcu);
2741
2742out_no_old_maps:
2743        dev_maps = new_dev_maps;
2744        active = true;
2745
2746out_no_new_maps:
2747        if (!is_rxqs_map) {
2748                /* update Tx queue numa node */
2749                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2750                                             (numa_node_id >= 0) ?
2751                                             numa_node_id : NUMA_NO_NODE);
2752        }
2753
2754        if (!dev_maps)
2755                goto out_no_maps;
2756
2757        /* removes tx-queue from unused CPUs/rx-queues */
2758        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2759             j < nr_ids;) {
2760                for (i = tc, tci = j * num_tc; i--; tci++)
2761                        active |= remove_xps_queue(dev_maps, tci, index);
2762                if (!netif_attr_test_mask(j, mask, nr_ids) ||
2763                    !netif_attr_test_online(j, online_mask, nr_ids))
2764                        active |= remove_xps_queue(dev_maps, tci, index);
2765                for (i = num_tc - tc, tci++; --i; tci++)
2766                        active |= remove_xps_queue(dev_maps, tci, index);
2767        }
2768
2769        /* free map if not active */
2770        if (!active)
2771                reset_xps_maps(dev, dev_maps, is_rxqs_map);
2772
2773out_no_maps:
2774        mutex_unlock(&xps_map_mutex);
2775
2776        return 0;
2777error:
2778        /* remove any maps that we added */
2779        for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2780             j < nr_ids;) {
2781                for (i = num_tc, tci = j * num_tc; i--; tci++) {
2782                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2783                        map = dev_maps ?
2784                              xmap_dereference(dev_maps->attr_map[tci]) :
2785                              NULL;
2786                        if (new_map && new_map != map)
2787                                kfree(new_map);
2788                }
2789        }
2790
2791        mutex_unlock(&xps_map_mutex);
2792
2793        kfree(new_dev_maps);
2794        return -ENOMEM;
2795}
2796EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2797
2798int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2799                        u16 index)
2800{
2801        int ret;
2802
2803        cpus_read_lock();
2804        ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2805        cpus_read_unlock();
2806
2807        return ret;
2808}
2809EXPORT_SYMBOL(netif_set_xps_queue);
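
/* Illustrative sketch (not part of the original source): a multiqueue driver
 * may seed a 1:1 CPU-to-queue XPS mapping after allocating its queues. Error
 * handling is omitted and the loop assumes at most one TX queue per possible
 * CPU.
 *
 *	int i;
 *
 *	for (i = 0; i < dev->real_num_tx_queues; i++)
 *		netif_set_xps_queue(dev, cpumask_of(i), i);
 */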
2810
2811#endif
2812static void netdev_unbind_all_sb_channels(struct net_device *dev)
2813{
2814        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2815
2816        /* Unbind any subordinate channels */
2817        while (txq-- != &dev->_tx[0]) {
2818                if (txq->sb_dev)
2819                        netdev_unbind_sb_channel(dev, txq->sb_dev);
2820        }
2821}
2822
2823void netdev_reset_tc(struct net_device *dev)
2824{
2825#ifdef CONFIG_XPS
2826        netif_reset_xps_queues_gt(dev, 0);
2827#endif
2828        netdev_unbind_all_sb_channels(dev);
2829
2830        /* Reset TC configuration of device */
2831        dev->num_tc = 0;
2832        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2833        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2834}
2835EXPORT_SYMBOL(netdev_reset_tc);
2836
2837int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2838{
2839        if (tc >= dev->num_tc)
2840                return -EINVAL;
2841
2842#ifdef CONFIG_XPS
2843        netif_reset_xps_queues(dev, offset, count);
2844#endif
2845        dev->tc_to_txq[tc].count = count;
2846        dev->tc_to_txq[tc].offset = offset;
2847        return 0;
2848}
2849EXPORT_SYMBOL(netdev_set_tc_queue);
2850
2851int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2852{
2853        if (num_tc > TC_MAX_QUEUE)
2854                return -EINVAL;
2855
2856#ifdef CONFIG_XPS
2857        netif_reset_xps_queues_gt(dev, 0);
2858#endif
2859        netdev_unbind_all_sb_channels(dev);
2860
2861        dev->num_tc = num_tc;
2862        return 0;
2863}
2864EXPORT_SYMBOL(netdev_set_num_tc);
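
/* Illustrative sketch (not part of the original source): configuring two
 * traffic classes over eight TX queues, with priority 7 steered to TC1.
 * The 4+4 queue split is an arbitrary example.
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	// TC0: queues 0-3
 *	netdev_set_tc_queue(dev, 1, 4, 4);	// TC1: queues 4-7
 *	netdev_set_prio_tc_map(dev, 7, 1);
 */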
2865
2866void netdev_unbind_sb_channel(struct net_device *dev,
2867                              struct net_device *sb_dev)
2868{
2869        struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2870
2871#ifdef CONFIG_XPS
2872        netif_reset_xps_queues_gt(sb_dev, 0);
2873#endif
2874        memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2875        memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2876
2877        while (txq-- != &dev->_tx[0]) {
2878                if (txq->sb_dev == sb_dev)
2879                        txq->sb_dev = NULL;
2880        }
2881}
2882EXPORT_SYMBOL(netdev_unbind_sb_channel);
2883
2884int netdev_bind_sb_channel_queue(struct net_device *dev,
2885                                 struct net_device *sb_dev,
2886                                 u8 tc, u16 count, u16 offset)
2887{
2888        /* Make certain the sb_dev and dev are already configured */
2889        if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2890                return -EINVAL;
2891
2892        /* We cannot hand out queues we don't have */
2893        if ((offset + count) > dev->real_num_tx_queues)
2894                return -EINVAL;
2895
2896        /* Record the mapping */
2897        sb_dev->tc_to_txq[tc].count = count;
2898        sb_dev->tc_to_txq[tc].offset = offset;
2899
2900        /* Provide a way for Tx queue to find the tc_to_txq map or
2901         * XPS map for itself.
2902         */
2903        while (count--)
2904                netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2905
2906        return 0;
2907}
2908EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2909
2910int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2911{
2912        /* Do not use a multiqueue device to represent a subordinate channel */
2913        if (netif_is_multiqueue(dev))
2914                return -ENODEV;
2915
2916        /* We allow channels 1 - 32767 to be used for subordinate channels.
2917         * Channel 0 is meant to be "native" mode and used only to represent
2918         * the main root device. We allow writing 0 to reset the device back
2919         * to normal mode after being used as a subordinate channel.
2920         */
2921        if (channel > S16_MAX)
2922                return -EINVAL;
2923
2924        dev->num_tc = -channel;
2925
2926        return 0;
2927}
2928EXPORT_SYMBOL(netdev_set_sb_channel);
2929
2930/*
2931 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2932 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2933 */
2934int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2935{
2936        bool disabling;
2937        int rc;
2938
2939        disabling = txq < dev->real_num_tx_queues;
2940
2941        if (txq < 1 || txq > dev->num_tx_queues)
2942                return -EINVAL;
2943
2944        if (dev->reg_state == NETREG_REGISTERED ||
2945            dev->reg_state == NETREG_UNREGISTERING) {
2946                ASSERT_RTNL();
2947
2948                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2949                                                  txq);
2950                if (rc)
2951                        return rc;
2952
2953                if (dev->num_tc)
2954                        netif_setup_tc(dev, txq);
2955
2956                dev->real_num_tx_queues = txq;
2957
2958                if (disabling) {
2959                        synchronize_net();
2960                        qdisc_reset_all_tx_gt(dev, txq);
2961#ifdef CONFIG_XPS
2962                        netif_reset_xps_queues_gt(dev, txq);
2963#endif
2964                }
2965        } else {
2966                dev->real_num_tx_queues = txq;
2967        }
2968
2969        return 0;
2970}
2971EXPORT_SYMBOL(netif_set_real_num_tx_queues);
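
/* Illustrative sketch (not part of the original source): a driver that only
 * managed to allocate "n_active" hardware rings (a hypothetical count)
 * shrinks the set of queues actually used, with the RTNL lock held:
 *
 *	int err;
 *
 *	ASSERT_RTNL();
 *	err = netif_set_real_num_tx_queues(dev, n_active);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(dev, n_active);
 */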
2972
2973#ifdef CONFIG_SYSFS
2974/**
2975 *      netif_set_real_num_rx_queues - set actual number of RX queues used
2976 *      @dev: Network device
2977 *      @rxq: Actual number of RX queues
2978 *
2979 *      This must be called either with the rtnl_lock held or before
2980 *      registration of the net device.  Returns 0 on success, or a
2981 *      negative error code.  If called before registration, it always
2982 *      succeeds.
2983 */
2984int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2985{
2986        int rc;
2987
2988        if (rxq < 1 || rxq > dev->num_rx_queues)
2989                return -EINVAL;
2990
2991        if (dev->reg_state == NETREG_REGISTERED) {
2992                ASSERT_RTNL();
2993
2994                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2995                                                  rxq);
2996                if (rc)
2997                        return rc;
2998        }
2999
3000        dev->real_num_rx_queues = rxq;
3001        return 0;
3002}
3003EXPORT_SYMBOL(netif_set_real_num_rx_queues);
3004#endif
3005
3006/**
3007 * netif_get_num_default_rss_queues - default number of RSS queues
3008 *
3009 * This routine should set an upper limit on the number of RSS queues
3010 * used by default by multiqueue devices.
3011 */
3012int netif_get_num_default_rss_queues(void)
3013{
3014        return is_kdump_kernel() ?
3015                1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
3016}
3017EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3018
3019static void __netif_reschedule(struct Qdisc *q)
3020{
3021        struct softnet_data *sd;
3022        unsigned long flags;
3023
3024        local_irq_save(flags);
3025        sd = this_cpu_ptr(&softnet_data);
3026        q->next_sched = NULL;
3027        *sd->output_queue_tailp = q;
3028        sd->output_queue_tailp = &q->next_sched;
3029        raise_softirq_irqoff(NET_TX_SOFTIRQ);
3030        local_irq_restore(flags);
3031}
3032
3033void __netif_schedule(struct Qdisc *q)
3034{
3035        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3036                __netif_reschedule(q);
3037}
3038EXPORT_SYMBOL(__netif_schedule);
3039
3040struct dev_kfree_skb_cb {
3041        enum skb_free_reason reason;
3042};
3043
3044static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3045{
3046        return (struct dev_kfree_skb_cb *)skb->cb;
3047}
3048
3049void netif_schedule_queue(struct netdev_queue *txq)
3050{
3051        rcu_read_lock();
3052        if (!netif_xmit_stopped(txq)) {
3053                struct Qdisc *q = rcu_dereference(txq->qdisc);
3054
3055                __netif_schedule(q);
3056        }
3057        rcu_read_unlock();
3058}
3059EXPORT_SYMBOL(netif_schedule_queue);
3060
3061void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3062{
3063        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3064                struct Qdisc *q;
3065
3066                rcu_read_lock();
3067                q = rcu_dereference(dev_queue->qdisc);
3068                __netif_schedule(q);
3069                rcu_read_unlock();
3070        }
3071}
3072EXPORT_SYMBOL(netif_tx_wake_queue);
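
/* Illustrative sketch (not part of the original source): a TX completion
 * handler restarts a queue it had stopped once enough descriptors are free
 * again. "ring" and example_ring_space() are hypothetical driver state.
 *
 *	struct netdev_queue *txq = netdev_get_tx_queue(dev, ring->index);
 *
 *	if (netif_tx_queue_stopped(txq) &&
 *	    example_ring_space(ring) > MAX_SKB_FRAGS)
 *		netif_tx_wake_queue(txq);
 */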
3073
3074void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
3075{
3076        unsigned long flags;
3077
3078        if (unlikely(!skb))
3079                return;
3080
3081        if (likely(refcount_read(&skb->users) == 1)) {
3082                smp_rmb();
3083                refcount_set(&skb->users, 0);
3084        } else if (likely(!refcount_dec_and_test(&skb->users))) {
3085                return;
3086        }
3087        get_kfree_skb_cb(skb)->reason = reason;
3088        local_irq_save(flags);
3089        skb->next = __this_cpu_read(softnet_data.completion_queue);
3090        __this_cpu_write(softnet_data.completion_queue, skb);
3091        raise_softirq_irqoff(NET_TX_SOFTIRQ);
3092        local_irq_restore(flags);
3093}
3094EXPORT_SYMBOL(__dev_kfree_skb_irq);
3095
3096void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
3097{
3098        if (in_irq() || irqs_disabled())
3099                __dev_kfree_skb_irq(skb, reason);
3100        else
3101                dev_kfree_skb(skb);
3102}
3103EXPORT_SYMBOL(__dev_kfree_skb_any);
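
/* Illustrative sketch (not part of the original source): drivers normally
 * use the dev_kfree_skb_any()/dev_consume_skb_any() wrappers, which resolve
 * to this helper, so the same cleanup code works both from hard-irq TX paths
 * and from process context:
 *
 *	static void example_unmap_and_free(struct sk_buff *skb)
 *	{
 *		// DMA unmapping would go here in a real driver
 *		dev_kfree_skb_any(skb);
 *	}
 */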
3104
3105
3106/**
3107 * netif_device_detach - mark device as removed
3108 * @dev: network device
3109 *
3110 * Mark device as removed from system and therefore no longer available.
3111 */
3112void netif_device_detach(struct net_device *dev)
3113{
3114        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3115            netif_running(dev)) {
3116                netif_tx_stop_all_queues(dev);
3117        }
3118}
3119EXPORT_SYMBOL(netif_device_detach);
3120
3121/**
3122 * netif_device_attach - mark device as attached
3123 * @dev: network device
3124 *
3125 * Mark device as attached to the system and restart if needed.
3126 */
3127void netif_device_attach(struct net_device *dev)
3128{
3129        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3130            netif_running(dev)) {
3131                netif_tx_wake_all_queues(dev);
3132                __netdev_watchdog_up(dev);
3133        }
3134}
3135EXPORT_SYMBOL(netif_device_attach);
3136
3137/*
3138 * Returns a Tx hash based on the given packet descriptor, using the number
3139 * of Tx queues as a distribution range.
3140 */
3141static u16 skb_tx_hash(const struct net_device *dev,
3142                       const struct net_device *sb_dev,
3143                       struct sk_buff *skb)
3144{
3145        u32 hash;
3146        u16 qoffset = 0;
3147        u16 qcount = dev->real_num_tx_queues;
3148
3149        if (dev->num_tc) {
3150                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3151
3152                qoffset = sb_dev->tc_to_txq[tc].offset;
3153                qcount = sb_dev->tc_to_txq[tc].count;
3154        }
3155
3156        if (skb_rx_queue_recorded(skb)) {
3157                hash = skb_get_rx_queue(skb);
3158                if (hash >= qoffset)
3159                        hash -= qoffset;
3160                while (unlikely(hash >= qcount))
3161                        hash -= qcount;
3162                return hash + qoffset;
3163        }
3164
3165        return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3166}
3167
3168static void skb_warn_bad_offload(const struct sk_buff *skb)
3169{
3170        static const netdev_features_t null_features;
3171        struct net_device *dev = skb->dev;
3172        const char *name = "";
3173
3174        if (!net_ratelimit())
3175                return;
3176
3177        if (dev) {
3178                if (dev->dev.parent)
3179                        name = dev_driver_string(dev->dev.parent);
3180                else
3181                        name = netdev_name(dev);
3182        }
3183        skb_dump(KERN_WARNING, skb, false);
3184        WARN(1, "%s: caps=(%pNF, %pNF)\n",
3185             name, dev ? &dev->features : &null_features,
3186             skb->sk ? &skb->sk->sk_route_caps : &null_features);
3187}
3188
3189/*
3190 * Invalidate hardware checksum when packet is to be mangled, and
3191 * complete checksum manually on outgoing path.
3192 */
3193int skb_checksum_help(struct sk_buff *skb)
3194{
3195        __wsum csum;
3196        int ret = 0, offset;
3197
3198        if (skb->ip_summed == CHECKSUM_COMPLETE)
3199                goto out_set_summed;
3200
3201        if (unlikely(skb_shinfo(skb)->gso_size)) {
3202                skb_warn_bad_offload(skb);
3203                return -EINVAL;
3204        }
3205
3206        /* Before computing a checksum, we should make sure no frag could
3207         * be modified by an external entity: the checksum could be wrong.
3208         */
3209        if (skb_has_shared_frag(skb)) {
3210                ret = __skb_linearize(skb);
3211                if (ret)
3212                        goto out;
3213        }
3214
3215        offset = skb_checksum_start_offset(skb);
3216        BUG_ON(offset >= skb_headlen(skb));
3217        csum = skb_checksum(skb, offset, skb->len - offset, 0);
3218
3219        offset += skb->csum_offset;
3220        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
3221
3222        ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3223        if (ret)
3224                goto out;
3225
3226        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3227out_set_summed:
3228        skb->ip_summed = CHECKSUM_NONE;
3229out:
3230        return ret;
3231}
3232EXPORT_SYMBOL(skb_checksum_help);
3233
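/* Software fallback for SCTP CRC32c offload: compute the CRC32c over
 * the region starting at the checksum start and store it in the SCTP
 * header.  Used when the device lacks NETIF_F_SCTP_CRC (see
 * skb_csum_hwoffload_help() below).
 */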
3234int skb_crc32c_csum_help(struct sk_buff *skb)
3235{
3236        __le32 crc32c_csum;
3237        int ret = 0, offset, start;
3238
3239        if (skb->ip_summed != CHECKSUM_PARTIAL)
3240                goto out;
3241
3242        if (unlikely(skb_is_gso(skb)))
3243                goto out;
3244
3245        /* Before computing a checksum, we should make sure no frag could
3246         * be modified by an external entity: the checksum could be wrong.
3247         */
3248        if (unlikely(skb_has_shared_frag(skb))) {
3249                ret = __skb_linearize(skb);
3250                if (ret)
3251                        goto out;
3252        }
3253        start = skb_checksum_start_offset(skb);
3254        offset = start + offsetof(struct sctphdr, checksum);
3255        if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3256                ret = -EINVAL;
3257                goto out;
3258        }
3259
3260        ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3261        if (ret)
3262                goto out;
3263
3264        crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3265                                                  skb->len - start, ~(__u32)0,
3266                                                  crc32c_csum_stub));
3267        *(__le32 *)(skb->data + offset) = crc32c_csum;
3268        skb->ip_summed = CHECKSUM_NONE;
3269        skb->csum_not_inet = 0;
3270out:
3271        return ret;
3272}
3273
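/* Determine the real network-layer protocol of @skb: look through an
 * inner Ethernet header for ETH_P_TEB (as used by tunnel GSO) and then
 * through any VLAN tags.  The resulting header depth is returned via
 * @depth.
 */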
3274__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3275{
3276        __be16 type = skb->protocol;
3277
3278        /* Tunnel gso handlers can set protocol to ethernet. */
3279        if (type == htons(ETH_P_TEB)) {
3280                struct ethhdr *eth;
3281
3282                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3283                        return 0;
3284
3285                eth = (struct ethhdr *)skb->data;
3286                type = eth->h_proto;
3287        }
3288
3289        return __vlan_get_protocol(skb, type, depth);
3290}
3291
3292/**
3293 *      skb_mac_gso_segment - mac layer segmentation handler.
3294 *      @skb: buffer to segment
3295 *      @features: features for the output path (see dev->features)
3296 */
3297struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
3298                                    netdev_features_t features)
3299{
3300        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
3301        struct packet_offload *ptype;
3302        int vlan_depth = skb->mac_len;
3303        __be16 type = skb_network_protocol(skb, &vlan_depth);
3304
3305        if (unlikely(!type))
3306                return ERR_PTR(-EINVAL);
3307
3308        __skb_pull(skb, vlan_depth);
3309
3310        rcu_read_lock();
3311        list_for_each_entry_rcu(ptype, &offload_base, list) {
3312                if (ptype->type == type && ptype->callbacks.gso_segment) {
3313                        segs = ptype->callbacks.gso_segment(skb, features);
3314                        break;
3315                }
3316        }
3317        rcu_read_unlock();
3318
3319        __skb_push(skb, skb->data - skb_mac_header(skb));
3320
3321        return segs;
3322}
3323EXPORT_SYMBOL(skb_mac_gso_segment);
3324
3325
3326/* openvswitch calls this on the rx path, so we need a different check.
3327 */
3328static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3329{
3330        if (tx_path)
3331                return skb->ip_summed != CHECKSUM_PARTIAL &&
3332                       skb->ip_summed != CHECKSUM_UNNECESSARY;
3333
3334        return skb->ip_summed == CHECKSUM_NONE;
3335}
3336
3337/**
3338 *      __skb_gso_segment - Perform segmentation on skb.
3339 *      @skb: buffer to segment
3340 *      @features: features for the output path (see dev->features)
3341 *      @tx_path: whether it is called in TX path
3342 *
3343 *      This function segments the given skb and returns a list of segments.
3344 *
3345 *      It may return NULL if the skb requires no segmentation.  This is
3346 *      only possible when GSO is used for verifying header integrity.
3347 *
3348 *      Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
3349 */
3350struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3351                                  netdev_features_t features, bool tx_path)
3352{
3353        struct sk_buff *segs;
3354
3355        if (unlikely(skb_needs_check(skb, tx_path))) {
3356                int err;
3357
3358                /* We're going to init ->check field in TCP or UDP header */
3359                err = skb_cow_head(skb, 0);
3360                if (err < 0)
3361                        return ERR_PTR(err);
3362        }
3363
3364        /* Only report GSO partial support if it will enable us to
3365         * support segmentation on this frame without needing additional
3366         * work.
3367         */
3368        if (features & NETIF_F_GSO_PARTIAL) {
3369                netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3370                struct net_device *dev = skb->dev;
3371
3372                partial_features |= dev->features & dev->gso_partial_features;
3373                if (!skb_gso_ok(skb, features | partial_features))
3374                        features &= ~NETIF_F_GSO_PARTIAL;
3375        }
3376
3377        BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
3378                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3379
3380        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3381        SKB_GSO_CB(skb)->encap_level = 0;
3382
3383        skb_reset_mac_header(skb);
3384        skb_reset_mac_len(skb);
3385
3386        segs = skb_mac_gso_segment(skb, features);
3387
3388        if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3389                skb_warn_bad_offload(skb);
3390
3391        return segs;
3392}
3393EXPORT_SYMBOL(__skb_gso_segment);
3394
3395/* Take action when hardware reception checksum errors are detected. */
3396#ifdef CONFIG_BUG
3397void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3398{
3399        if (net_ratelimit()) {
3400                pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3401                skb_dump(KERN_ERR, skb, true);
3402                dump_stack();
3403        }
3404}
3405EXPORT_SYMBOL(netdev_rx_csum_fault);
3406#endif
3407
3408/* XXX: check that highmem exists at all on the given machine. */
3409static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3410{
3411#ifdef CONFIG_HIGHMEM
3412        int i;
3413
3414        if (!(dev->features & NETIF_F_HIGHDMA)) {
3415                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3416                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3417
3418                        if (PageHighMem(skb_frag_page(frag)))
3419                                return 1;
3420                }
3421        }
3422#endif
3423        return 0;
3424}
3425
3426/* For MPLS offload requests, verify we are testing hardware MPLS features
3427 * instead of the standard features for the netdev.
3428 */
3429#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3430static netdev_features_t net_mpls_features(struct sk_buff *skb,
3431                                           netdev_features_t features,
3432                                           __be16 type)
3433{
3434        if (eth_p_mpls(type))
3435                features &= skb->dev->mpls_features;
3436
3437        return features;
3438}
3439#else
3440static netdev_features_t net_mpls_features(struct sk_buff *skb,
3441                                           netdev_features_t features,
3442                                           __be16 type)
3443{
3444        return features;
3445}
3446#endif
3447
3448static netdev_features_t harmonize_features(struct sk_buff *skb,
3449        netdev_features_t features)
3450{
3451        int tmp;
3452        __be16 type;
3453
3454        type = skb_network_protocol(skb, &tmp);
3455        features = net_mpls_features(skb, features, type);
3456
3457        if (skb->ip_summed != CHECKSUM_NONE &&
3458            !can_checksum_protocol(features, type)) {
3459                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3460        }
3461        if (illegal_highdma(skb->dev, skb))
3462                features &= ~NETIF_F_SG;
3463
3464        return features;
3465}
3466
3467netdev_features_t passthru_features_check(struct sk_buff *skb,
3468                                          struct net_device *dev,
3469                                          netdev_features_t features)
3470{
3471        return features;
3472}
3473EXPORT_SYMBOL(passthru_features_check);
3474
3475static netdev_features_t dflt_features_check(struct sk_buff *skb,
3476                                             struct net_device *dev,
3477                                             netdev_features_t features)
3478{
3479        return vlan_features_check(skb, features);
3480}
3481
3482static netdev_features_t gso_features_check(const struct sk_buff *skb,
3483                                            struct net_device *dev,
3484                                            netdev_features_t features)
3485{
3486        u16 gso_segs = skb_shinfo(skb)->gso_segs;
3487
3488        if (gso_segs > dev->gso_max_segs)
3489                return features & ~NETIF_F_GSO_MASK;
3490
3491        /* Support for GSO partial features requires software
3492         * intervention before we can actually process the packets,
3493         * so we need to strip support for any partial features now;
3494         * we can pull them back in after we have partially
3495         * segmented the frame.
3496         */
3497        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3498                features &= ~dev->gso_partial_features;
3499
3500        /* Make sure to clear the IPv4 ID mangling feature if the
3501         * IPv4 header has the potential to be fragmented.
3502         */
3503        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3504                struct iphdr *iph = skb->encapsulation ?
3505                                    inner_ip_hdr(skb) : ip_hdr(skb);
3506
3507                if (!(iph->frag_off & htons(IP_DF)))
3508                        features &= ~NETIF_F_TSO_MANGLEID;
3509        }
3510
3511        return features;
3512}
3513
3514netdev_features_t netif_skb_features(struct sk_buff *skb)
3515{
3516        struct net_device *dev = skb->dev;
3517        netdev_features_t features = dev->features;
3518
3519        if (skb_is_gso(skb))
3520                features = gso_features_check(skb, dev, features);
3521
3522        /* For encapsulation offload requests, verify we are testing
3523         * hardware encapsulation features instead of the standard
3524         * features for the netdev.
3525         */
3526        if (skb->encapsulation)
3527                features &= dev->hw_enc_features;
3528
3529        if (skb_vlan_tagged(skb))
3530                features = netdev_intersect_features(features,
3531                                                     dev->vlan_features |
3532                                                     NETIF_F_HW_VLAN_CTAG_TX |
3533                                                     NETIF_F_HW_VLAN_STAG_TX);
3534
3535        if (dev->netdev_ops->ndo_features_check)
3536                features &= dev->netdev_ops->ndo_features_check(skb, dev,
3537                                                                features);
3538        else
3539                features &= dflt_features_check(skb, dev, features);
3540
3541        return harmonize_features(skb, features);
3542}
3543EXPORT_SYMBOL(netif_skb_features);
3544
3545static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3546                    struct netdev_queue *txq, bool more)
3547{
3548        unsigned int len;
3549        int rc;
3550
3551        if (dev_nit_active(dev))
3552                dev_queue_xmit_nit(skb, dev);
3553
3554        len = skb->len;
3555        trace_net_dev_start_xmit(skb, dev);
3556        rc = netdev_start_xmit(skb, dev, txq, more);
3557        trace_net_dev_xmit(skb, rc, dev, len);
3558
3559        return rc;
3560}
3561
3562struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3563                                    struct netdev_queue *txq, int *ret)
3564{
3565        struct sk_buff *skb = first;
3566        int rc = NETDEV_TX_OK;
3567
3568        while (skb) {
3569                struct sk_buff *next = skb->next;
3570
3571                skb_mark_not_on_list(skb);
3572                rc = xmit_one(skb, dev, txq, next != NULL);
3573                if (unlikely(!dev_xmit_complete(rc))) {
3574                        skb->next = next;
3575                        goto out;
3576                }
3577
3578                skb = next;
3579                if (netif_tx_queue_stopped(txq) && skb) {
3580                        rc = NETDEV_TX_BUSY;
3581                        break;
3582                }
3583        }
3584
3585out:
3586        *ret = rc;
3587        return skb;
3588}
3589
3590static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3591                                          netdev_features_t features)
3592{
3593        if (skb_vlan_tag_present(skb) &&
3594            !vlan_hw_offload_capable(features, skb->vlan_proto))
3595                skb = __vlan_hwaccel_push_inside(skb);
3596        return skb;
3597}
3598
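/* Decide whether the checksum can be left to the hardware.  Returns 0
 * when the device advertises the required feature (NETIF_F_SCTP_CRC for
 * CRC32c, any bit in NETIF_F_CSUM_MASK otherwise); falls back to the
 * software helpers above when it does not.
 */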
3599int skb_csum_hwoffload_help(struct sk_buff *skb,
3600                            const netdev_features_t features)
3601{
3602        if (unlikely(skb->csum_not_inet))
3603                return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3604                        skb_crc32c_csum_help(skb);
3605
3606        return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3607}
3608EXPORT_SYMBOL(skb_csum_hwoffload_help);
3609
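/* Last-minute fixups before handing an skb to the driver: insert a VLAN
 * tag the hardware cannot offload, software-segment GSO skbs the device
 * cannot handle, linearize if required, and resolve any checksum the
 * device cannot compute.  Returns NULL (and accounts a tx_dropped) if
 * the skb had to be dropped.
 */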
3610static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3611{
3612        netdev_features_t features;
3613
3614        features = netif_skb_features(skb);
3615        skb = validate_xmit_vlan(skb, features);
3616        if (unlikely(!skb))
3617                goto out_null;
3618
3619        skb = sk_validate_xmit_skb(skb, dev);
3620        if (unlikely(!skb))
3621                goto out_null;
3622
3623        if (netif_needs_gso(skb, features)) {
3624                struct sk_buff *segs;
3625
3626                segs = skb_gso_segment(skb, features);
3627                if (IS_ERR(segs)) {
3628                        goto out_kfree_skb;
3629                } else if (segs) {
3630                        consume_skb(skb);
3631                        skb = segs;
3632                }
3633        } else {
3634                if (skb_needs_linearize(skb, features) &&
3635                    __skb_linearize(skb))
3636                        goto out_kfree_skb;
3637
3638                /* If packet is not checksummed and device does not
3639                 * support checksumming for this protocol, complete
3640                 * checksumming here.
3641                 */
3642                if (skb->ip_summed == CHECKSUM_PARTIAL) {
3643                        if (skb->encapsulation)
3644                                skb_set_inner_transport_header(skb,
3645                                                               skb_checksum_start_offset(skb));
3646                        else
3647                                skb_set_transport_header(skb,
3648                                                         skb_checksum_start_offset(skb));
3649                        if (skb_csum_hwoffload_help(skb, features))
3650                                goto out_kfree_skb;
3651                }
3652        }
3653
3654        skb = validate_xmit_xfrm(skb, features, again);
3655
3656        return skb;
3657
3658out_kfree_skb:
3659        kfree_skb(skb);
3660out_null:
3661        atomic_long_inc(&dev->tx_dropped);
3662        return NULL;
3663}
3664
3665struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3666{
3667        struct sk_buff *next, *head = NULL, *tail;
3668
3669        for (; skb != NULL; skb = next) {
3670                next = skb->next;
3671                skb_mark_not_on_list(skb);
3672
3673                /* in case skb won't be segmented, point skb->prev to the skb itself */
3674                skb->prev = skb;
3675
3676                skb = validate_xmit_skb(skb, dev, again);
3677                if (!skb)
3678                        continue;
3679
3680                if (!head)
3681                        head = skb;
3682                else
3683                        tail->next = skb;
3684                /* If skb was segmented, skb->prev points to
3685                 * the last segment. If not, it still contains skb.
3686                 */
3687                tail = skb->prev;
3688        }
3689        return head;
3690}
3691EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3692
3693static void qdisc_pkt_len_init(struct sk_buff *skb)
3694{
3695        const struct skb_shared_info *shinfo = skb_shinfo(skb);
3696
3697        qdisc_skb_cb(skb)->pkt_len = skb->len;
3698
3699        /* To get a more precise estimate of the bytes sent on the wire,
3700         * we add the header size of all segments to pkt_len
3701         */
3702        if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3703                unsigned int hdr_len;
3704                u16 gso_segs = shinfo->gso_segs;
3705
3706                /* mac layer + network layer */
3707                hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3708
3709                /* + transport layer */
3710                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3711                        const struct tcphdr *th;
3712                        struct tcphdr _tcphdr;
3713
3714                        th = skb_header_pointer(skb, skb_transport_offset(skb),
3715                                                sizeof(_tcphdr), &_tcphdr);
3716                        if (likely(th))
3717                                hdr_len += __tcp_hdrlen(th);
3718                } else {
3719                        struct udphdr _udphdr;
3720
3721                        if (skb_header_pointer(skb, skb_transport_offset(skb),
3722                                               sizeof(_udphdr), &_udphdr))
3723                                hdr_len += sizeof(struct udphdr);
3724                }
3725
3726                if (shinfo->gso_type & SKB_GSO_DODGY)
3727                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3728                                                shinfo->gso_size);
3729
3730                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3731        }
3732}
3733
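/* Enqueue @skb on qdisc @q and kick transmission.  Three paths exist:
 * lockless qdiscs (TCQ_F_NOLOCK) enqueue and run without the root lock;
 * an idle TCQ_F_CAN_BYPASS qdisc lets the skb bypass the queue and go
 * straight to the device; otherwise the skb is enqueued under the root
 * lock and __qdisc_run() drains the queue.
 */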
3734static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3735                                 struct net_device *dev,
3736                                 struct netdev_queue *txq)
3737{
3738        spinlock_t *root_lock = qdisc_lock(q);
3739        struct sk_buff *to_free = NULL;
3740        bool contended;
3741        int rc;
3742
3743        qdisc_calculate_pkt_len(skb, q);
3744
3745        if (q->flags & TCQ_F_NOLOCK) {
3746                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3747                qdisc_run(q);
3748
3749                if (unlikely(to_free))
3750                        kfree_skb_list(to_free);
3751                return rc;
3752        }
3753
3754        /*
3755         * Heuristic to force contended enqueues to serialize on a
3756         * separate lock before trying to get the qdisc main lock.
3757         * This permits the qdisc->running owner to get the lock more
3758         * often and dequeue packets faster.
3759         */
3760        contended = qdisc_is_running(q);
3761        if (unlikely(contended))
3762                spin_lock(&q->busylock);
3763
3764        spin_lock(root_lock);
3765        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3766                __qdisc_drop(skb, &to_free);
3767                rc = NET_XMIT_DROP;
3768        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3769                   qdisc_run_begin(q)) {
3770                /*
3771                 * This is a work-conserving queue; there are no old skbs
3772                 * waiting to be sent out; and the qdisc is not running -
3773                 * xmit the skb directly.
3774                 */
3775
3776                qdisc_bstats_update(q, skb);
3777
3778                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3779                        if (unlikely(contended)) {
3780                                spin_unlock(&q->busylock);
3781                                contended = false;
3782                        }
3783                        __qdisc_run(q);
3784                }
3785
3786                qdisc_run_end(q);
3787                rc = NET_XMIT_SUCCESS;
3788        } else {
3789                rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3790                if (qdisc_run_begin(q)) {
3791                        if (unlikely(contended)) {
3792                                spin_unlock(&q->busylock);
3793                                contended = false;
3794                        }
3795                        __qdisc_run(q);
3796                        qdisc_run_end(q);
3797                }
3798        }
3799        spin_unlock(root_lock);
3800        if (unlikely(to_free))
3801                kfree_skb_list(to_free);
3802        if (unlikely(contended))
3803                spin_unlock(&q->busylock);
3804        return rc;
3805}
3806
3807#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3808static void skb_update_prio(struct sk_buff *skb)
3809{
3810        const struct netprio_map *map;
3811        const struct sock *sk;
3812        unsigned int prioidx;
3813
3814        if (skb->priority)
3815                return;
3816        map = rcu_dereference_bh(skb->dev->priomap);
3817        if (!map)
3818                return;
3819        sk = skb_to_full_sk(skb);
3820        if (!sk)
3821                return;
3822
3823        prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3824
3825        if (prioidx < map->priomap_len)
3826                skb->priority = map->priomap[prioidx];
3827}
3828#else
3829#define skb_update_prio(skb)
3830#endif
3831
3832/**
3833 *      dev_loopback_xmit - loop back @skb
3834 *      @net: network namespace this loopback is happening in
3835 *      @sk:  sk needed to be a netfilter okfn
3836 *      @skb: buffer to transmit
3837 */
3838int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3839{
3840        skb_reset_mac_header(skb);
3841        __skb_pull(skb, skb_network_offset(skb));
3842        skb->pkt_type = PACKET_LOOPBACK;
3843        skb->ip_summed = CHECKSUM_UNNECESSARY;
3844        WARN_ON(!skb_dst(skb));
3845        skb_dst_force(skb);
3846        netif_rx_ni(skb);
3847        return 0;
3848}
3849EXPORT_SYMBOL(dev_loopback_xmit);
3850
3851#ifdef CONFIG_NET_EGRESS
3852static struct sk_buff *
3853sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3854{
3855        struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3856        struct tcf_result cl_res;
3857
3858        if (!miniq)
3859                return skb;
3860
3861        /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3862        mini_qdisc_bstats_cpu_update(miniq, skb);
3863
3864        switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3865        case TC_ACT_OK:
3866        case TC_ACT_RECLASSIFY:
3867                skb->tc_index = TC_H_MIN(cl_res.classid);
3868                break;
3869        case TC_ACT_SHOT:
3870                mini_qdisc_qstats_cpu_drop(miniq);
3871                *ret = NET_XMIT_DROP;
3872                kfree_skb(skb);
3873                return NULL;
3874        case TC_ACT_STOLEN:
3875        case TC_ACT_QUEUED:
3876        case TC_ACT_TRAP:
3877                *ret = NET_XMIT_SUCCESS;
3878                consume_skb(skb);
3879                return NULL;
3880        case TC_ACT_REDIRECT:
3881                /* No need to push/pop skb's mac_header here on egress! */
3882                skb_do_redirect(skb);
3883                *ret = NET_XMIT_SUCCESS;
3884                return NULL;
3885        default:
3886                break;
3887        }
3888
3889        return skb;
3890}
3891#endif /* CONFIG_NET_EGRESS */
3892
3893#ifdef CONFIG_XPS
3894static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3895                               struct xps_dev_maps *dev_maps, unsigned int tci)
3896{
3897        struct xps_map *map;
3898        int queue_index = -1;
3899
3900        if (dev->num_tc) {
3901                tci *= dev->num_tc;
3902                tci += netdev_get_prio_tc_map(dev, skb->priority);
3903        }
3904
3905        map = rcu_dereference(dev_maps->attr_map[tci]);
3906        if (map) {
3907                if (map->len == 1)
3908                        queue_index = map->queues[0];
3909                else
3910                        queue_index = map->queues[reciprocal_scale(
3911                                                skb_get_hash(skb), map->len)];
3912                if (unlikely(queue_index >= dev->real_num_tx_queues))
3913                        queue_index = -1;
3914        }
3915        return queue_index;
3916}
3917#endif
3918
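/* XPS (Transmit Packet Steering) lookup: try the per-receive-queue map
 * keyed by the socket's recorded RX queue first, then fall back to the
 * per-CPU map keyed by skb->sender_cpu.  Returns -1 when XPS is not in
 * use or no usable mapping exists.
 */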
3919static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3920                         struct sk_buff *skb)
3921{
3922#ifdef CONFIG_XPS
3923        struct xps_dev_maps *dev_maps;
3924        struct sock *sk = skb->sk;
3925        int queue_index = -1;
3926
3927        if (!static_key_false(&xps_needed))
3928                return -1;
3929
3930        rcu_read_lock();
3931        if (!static_key_false(&xps_rxqs_needed))
3932                goto get_cpus_map;
3933
3934        dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3935        if (dev_maps) {
3936                int tci = sk_rx_queue_get(sk);
3937
3938                if (tci >= 0 && tci < dev->num_rx_queues)
3939                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3940                                                          tci);
3941        }
3942
3943get_cpus_map:
3944        if (queue_index < 0) {
3945                dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3946                if (dev_maps) {
3947                        unsigned int tci = skb->sender_cpu - 1;
3948
3949                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3950                                                          tci);
3951                }
3952        }
3953        rcu_read_unlock();
3954
3955        return queue_index;
3956#else
3957        return -1;
3958#endif
3959}
3960
3961u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3962                     struct net_device *sb_dev)
3963{
3964        return 0;
3965}
3966EXPORT_SYMBOL(dev_pick_tx_zero);
3967
3968u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
3969                       struct net_device *sb_dev)
3970{
3971        return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
3972}
3973EXPORT_SYMBOL(dev_pick_tx_cpu_id);
3974
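/* Default TX queue selection used when the driver has no
 * ndo_select_queue(): reuse the queue index cached on the socket while
 * it is still valid and ooo_okay is not set; otherwise consult XPS,
 * fall back to skb_tx_hash(), and cache the result on full sockets that
 * hold a destination route.
 */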
3975u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
3976                     struct net_device *sb_dev)
3977{
3978        struct sock *sk = skb->sk;
3979        int queue_index = sk_tx_queue_get(sk);
3980
3981        sb_dev = sb_dev ? : dev;
3982
3983        if (queue_index < 0 || skb->ooo_okay ||
3984            queue_index >= dev->real_num_tx_queues) {
3985                int new_index = get_xps_queue(dev, sb_dev, skb);
3986
3987                if (new_index < 0)
3988                        new_index = skb_tx_hash(dev, sb_dev, skb);
3989
3990                if (queue_index != new_index && sk &&
3991                    sk_fullsock(sk) &&
3992                    rcu_access_pointer(sk->sk_dst_cache))
3993                        sk_tx_queue_set(sk, new_index);
3994
3995                queue_index = new_index;
3996        }
3997
3998        return queue_index;
3999}
4000EXPORT_SYMBOL(netdev_pick_tx);
4001
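/* Pick the netdev_queue for @skb: ask the driver's ndo_select_queue()
 * when it exists, otherwise use netdev_pick_tx(), then clamp the index
 * and record it in skb->queue_mapping.
 */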
4002struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4003                                         struct sk_buff *skb,
4004                                         struct net_device *sb_dev)
4005{
4006        int queue_index = 0;
4007
4008#ifdef CONFIG_XPS
4009        u32 sender_cpu = skb->sender_cpu - 1;
4010
4011        if (sender_cpu >= (u32)NR_CPUS)
4012                skb->sender_cpu = raw_smp_processor_id() + 1;
4013#endif
4014
4015        if (dev->real_num_tx_queues != 1) {
4016                const struct net_device_ops *ops = dev->netdev_ops;
4017
4018                if (ops->ndo_select_queue)
4019                        queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4020                else
4021                        queue_index = netdev_pick_tx(dev, skb, sb_dev);
4022
4023                queue_index = netdev_cap_txqueue(dev, queue_index);
4024        }
4025
4026        skb_set_queue_mapping(skb, queue_index);
4027        return netdev_get_tx_queue(dev, queue_index);
4028}
4029
4030/**
4031 *      __dev_queue_xmit - transmit a buffer
4032 *      @skb: buffer to transmit
4033 *      @sb_dev: subordinate device used for L2 forwarding offload
4034 *
4035 *      Queue a buffer for transmission to a network device. The caller must
4036 *      have set the device and priority and built the buffer before calling
4037 *      this function. The function can be called from an interrupt.
4038 *
4039 *      A negative errno code is returned on a failure. A success does not
4040 *      guarantee the frame will be transmitted as it may be dropped due
4041 *      to congestion or traffic shaping.
4042 *
4043 * -----------------------------------------------------------------------------------
4044 *      I notice this method can also return errors from the queue disciplines,
4045 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
4046 *      be positive.
4047 *
4048 *      Regardless of the return value, the skb is consumed, so it is currently
4049 *      difficult to retry a send to this method.  (You can bump the ref count
4050 *      before sending to hold a reference for retry if you are careful.)
4051 *
4052 *      When calling this method, interrupts MUST be enabled.  This is because
4053 *      the BH enable code must have IRQs enabled so that it will not deadlock.
4054 *          --BLG
4055 */
4056static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4057{
4058        struct net_device *dev = skb->dev;
4059        struct netdev_queue *txq;
4060        struct Qdisc *q;
4061        int rc = -ENOMEM;
4062        bool again = false;
4063
4064        skb_reset_mac_header(skb);
4065
4066        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4067                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
4068
4069        /* Disable soft irqs for various locks below. Also
4070         * stops preemption for RCU.
4071         */
4072        rcu_read_lock_bh();
4073
4074        skb_update_prio(skb);
4075
4076        qdisc_pkt_len_init(skb);
4077#ifdef CONFIG_NET_CLS_ACT
4078        skb->tc_at_ingress = 0;
4079# ifdef CONFIG_NET_EGRESS
4080        if (static_branch_unlikely(&egress_needed_key)) {
4081                skb = sch_handle_egress(skb, &rc, dev);
4082                if (!skb)
4083                        goto out;
4084        }
4085# endif
4086#endif
4087        /* If device/qdisc don't need skb->dst, release it right now while
4088         * it's hot in this CPU's cache.
4089         */
4090        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4091                skb_dst_drop(skb);
4092        else
4093                skb_dst_force(skb);
4094
4095        txq = netdev_core_pick_tx(dev, skb, sb_dev);
4096        q = rcu_dereference_bh(txq->qdisc);
4097
4098        trace_net_dev_queue(skb);
4099        if (q->enqueue) {
4100                rc = __dev_xmit_skb(skb, q, dev, txq);
4101                goto out;
4102        }
4103
4104        /* The device has no queue. Common case for software devices:
4105         * loopback, all sorts of tunnels...
4106         *
4107         * Really, it is unlikely that netif_tx_lock protection is necessary
4108         * here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
4109         * counters.)
4110         * However, it is possible that they rely on the protection
4111         * made by us here.
4112         *
4113         * Check this and take the lock; it is not prone to deadlocks.
4114         * Either way, the noqueue qdisc path is even simpler 8)
4115         */
4116        if (dev->flags & IFF_UP) {
4117                int cpu = smp_processor_id(); /* ok because BHs are off */
4118
4119                if (txq->xmit_lock_owner != cpu) {
4120                        if (dev_xmit_recursion())
4121                                goto recursion_alert;
4122
4123                        skb = validate_xmit_skb(skb, dev, &again);
4124                        if (!skb)
4125                                goto out;
4126
4127                        HARD_TX_LOCK(dev, txq, cpu);
4128
4129                        if (!netif_xmit_stopped(txq)) {
4130                                dev_xmit_recursion_inc();
4131                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4132                                dev_xmit_recursion_dec();
4133                                if (dev_xmit_complete(rc)) {
4134                                        HARD_TX_UNLOCK(dev, txq);
4135                                        goto out;
4136                                }
4137                        }
4138                        HARD_TX_UNLOCK(dev, txq);
4139                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4140                                             dev->name);
4141                } else {
4142                        /* Recursion is detected! It is possible,
4143                         * unfortunately
4144                         */
4145recursion_alert:
4146                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4147                                             dev->name);
4148                }
4149        }
4150
4151        rc = -ENETDOWN;
4152        rcu_read_unlock_bh();
4153
4154        atomic_long_inc(&dev->tx_dropped);
4155        kfree_skb_list(skb);
4156        return rc;
4157out:
4158        rcu_read_unlock_bh();
4159        return rc;
4160}
4161
4162int dev_queue_xmit(struct sk_buff *skb)
4163{
4164        return __dev_queue_xmit(skb, NULL);
4165}
4166EXPORT_SYMBOL(dev_queue_xmit);
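/* A minimal usage sketch, illustrative only and not lifted from a real
 * caller: a protocol layer sets skb->dev and skb->priority, hands the
 * skb off and then forgets about it, since the skb is consumed no
 * matter what the return value is:
 *
 *	skb->dev = dev;
 *	skb->priority = sk->sk_priority;
 *	rc = dev_queue_xmit(skb);
 *	(rc may be a negative errno or a positive NET_XMIT_* code)
 */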
4167
4168int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
4169{
4170        return __dev_queue_xmit(skb, sb_dev);
4171}
4172EXPORT_SYMBOL(dev_queue_xmit_accel);
4173
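/* Transmit an skb directly on TX queue @queue_id, bypassing the qdisc
 * layer.  The skb is validated (and possibly segmented) first; it is
 * always consumed, and NET_XMIT_DROP is returned if the device is not
 * running, has no carrier, or validation altered the skb list.
 */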
4174int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4175{
4176        struct net_device *dev = skb->dev;
4177        struct sk_buff *orig_skb = skb;
4178        struct netdev_queue *txq;
4179        int ret = NETDEV_TX_BUSY;
4180        bool again = false;
4181
4182        if (unlikely(!netif_running(dev) ||
4183                     !netif_carrier_ok(dev)))
4184                goto drop;
4185
4186        skb = validate_xmit_skb_list(skb, dev, &again);
4187        if (skb != orig_skb)
4188                goto drop;
4189
4190        skb_set_queue_mapping(skb, queue_id);
4191        txq = skb_get_tx_queue(dev, skb);
4192
4193        local_bh_disable();
4194
4195        dev_xmit_recursion_inc();
4196        HARD_TX_LOCK(dev, txq, smp_processor_id());
4197        if (!netif_xmit_frozen_or_drv_stopped(txq))
4198                ret = netdev_start_xmit(skb, dev, txq, false);
4199        HARD_TX_UNLOCK(dev, txq);
4200        dev_xmit_recursion_dec();
4201
4202        local_bh_enable();
4203
4204        if (!dev_xmit_complete(ret))
4205                kfree_skb(skb);
4206
4207        return ret;
4208drop:
4209        atomic_long_inc(&dev->tx_dropped);
4210        kfree_skb_list(skb);
4211        return NET_XMIT_DROP;
4212}
4213EXPORT_SYMBOL(dev_direct_xmit);
4214
4215/*************************************************************************
4216 *                      Receiver routines
4217 *************************************************************************/
4218
4219int netdev_max_backlog __read_mostly = 1000;
4220EXPORT_SYMBOL(netdev_max_backlog);
4221
4222int netdev_tstamp_prequeue __read_mostly = 1;
4223int netdev_budget __read_mostly = 300;
4224/* Must be at least 2 jiffies to guarantee a 1 jiffy timeout */
4225unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4226int weight_p __read_mostly = 64;           /* old backlog weight */
4227int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
4228int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
4229int dev_rx_weight __read_mostly = 64;
4230int dev_tx_weight __read_mostly = 64;
4231/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
4232int gro_normal_batch __read_mostly = 8;
4233
4234/* Called with irq disabled */
4235static inline void ____napi_schedule(struct softnet_data *sd,
4236                                     struct napi_struct *napi)
4237{
4238        list_add_tail(&napi->poll_list, &sd->poll_list);
4239        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4240}
4241
4242#ifdef CONFIG_RPS
4243
4244/* One global table that all flow-based protocols share. */
4245struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4246EXPORT_SYMBOL(rps_sock_flow_table);
4247u32 rps_cpu_mask __read_mostly;
4248EXPORT_SYMBOL(rps_cpu_mask);
4249
4250struct static_key_false rps_needed __read_mostly;
4251EXPORT_SYMBOL(rps_needed);
4252struct static_key_false rfs_needed __read_mostly;
4253EXPORT_SYMBOL(rfs_needed);
4254
4255static struct rps_dev_flow *
4256set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4257            struct rps_dev_flow *rflow, u16 next_cpu)
4258{
4259        if (next_cpu < nr_cpu_ids) {
4260#ifdef CONFIG_RFS_ACCEL
4261                struct netdev_rx_queue *rxqueue;
4262                struct rps_dev_flow_table *flow_table;
4263                struct rps_dev_flow *old_rflow;
4264                u32 flow_id;
4265                u16 rxq_index;
4266                int rc;
4267
4268                /* Should we steer this flow to a different hardware queue? */
4269                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4270                    !(dev->features & NETIF_F_NTUPLE))
4271                        goto out;
4272                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4273                if (rxq_index == skb_get_rx_queue(skb))
4274                        goto out;
4275
4276                rxqueue = dev->_rx + rxq_index;
4277                flow_table = rcu_dereference(rxqueue->rps_flow_table);
4278                if (!flow_table)
4279                        goto out;
4280                flow_id = skb_get_hash(skb) & flow_table->mask;
4281                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4282                                                        rxq_index, flow_id);
4283                if (rc < 0)
4284                        goto out;
4285                old_rflow = rflow;
4286                rflow = &flow_table->flows[flow_id];
4287                rflow->filter = rc;
4288                if (old_rflow->filter == rflow->filter)
4289                        old_rflow->filter = RPS_NO_FILTER;
4290        out:
4291#endif
4292                rflow->last_qtail =
4293                        per_cpu(softnet_data, next_cpu).input_queue_head;
4294        }
4295
4296        rflow->cpu = next_cpu;
4297        return rflow;
4298}
4299
4300/*
4301 * get_rps_cpu is called from netif_receive_skb and returns the target
4302 * CPU from the RPS map of the receiving queue for a given skb.
4303 * rcu_read_lock must be held on entry.
4304 */
4305static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4306                       struct rps_dev_flow **rflowp)
4307{
4308        const struct rps_sock_flow_table *sock_flow_table;
4309        struct netdev_rx_queue *rxqueue = dev->_rx;
4310        struct rps_dev_flow_table *flow_table;
4311        struct rps_map *map;
4312        int cpu = -1;
4313        u32 tcpu;
4314        u32 hash;
4315
4316        if (skb_rx_queue_recorded(skb)) {
4317                u16 index = skb_get_rx_queue(skb);
4318
4319                if (unlikely(index >= dev->real_num_rx_queues)) {
4320                        WARN_ONCE(dev->real_num_rx_queues > 1,
4321                                  "%s received packet on queue %u, but number "
4322                                  "of RX queues is %u\n",
4323                                  dev->name, index, dev->real_num_rx_queues);
4324                        goto done;
4325                }
4326                rxqueue += index;
4327        }
4328
4329        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4330
4331        flow_table = rcu_dereference(rxqueue->rps_flow_table);
4332        map = rcu_dereference(rxqueue->rps_map);
4333        if (!flow_table && !map)
4334                goto done;
4335
4336        skb_reset_network_header(skb);
4337        hash = skb_get_hash(skb);
4338        if (!hash)
4339                goto done;
4340
4341        sock_flow_table = rcu_dereference(rps_sock_flow_table);
4342        if (flow_table && sock_flow_table) {
4343                struct rps_dev_flow *rflow;
4344                u32 next_cpu;
4345                u32 ident;
4346
4347                /* First check into global flow table if there is a match */
4348                ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4349                if ((ident ^ hash) & ~rps_cpu_mask)
4350                        goto try_rps;
4351
4352                next_cpu = ident & rps_cpu_mask;
4353
4354                /* OK, now we know there is a match,
4355                 * we can look at the local (per receive queue) flow table
4356                 */
4357                rflow = &flow_table->flows[hash & flow_table->mask];
4358                tcpu = rflow->cpu;
4359
4360                /*
4361                 * If the desired CPU (where last recvmsg was done) is
4362                 * different from current CPU (one in the rx-queue flow
4363                 * table entry), switch if one of the following holds:
4364                 *   - Current CPU is unset (>= nr_cpu_ids).
4365                 *   - Current CPU is offline.
4366                 *   - The current CPU's queue tail has advanced beyond the
4367                 *     last packet that was enqueued using this table entry.
4368                 *     This guarantees that all previous packets for the flow
4369                 *     have been dequeued, thus preserving in order delivery.
4370                 */
4371                if (unlikely(tcpu != next_cpu) &&
4372                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4373                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4374                      rflow->last_qtail)) >= 0)) {
4375                        tcpu = next_cpu;
4376                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4377                }
4378
4379                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4380                        *rflowp = rflow;
4381                        cpu = tcpu;
4382                        goto done;
4383                }
4384        }
4385
4386try_rps:
4387
4388        if (map) {
4389                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4390                if (cpu_online(tcpu)) {
4391                        cpu = tcpu;
4392                        goto done;
4393                }
4394        }
4395
4396done:
4397        return cpu;
4398}
4399
4400#ifdef CONFIG_RFS_ACCEL
4401
4402/**
4403 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4404 * @dev: Device on which the filter was set
4405 * @rxq_index: RX queue index
4406 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4407 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4408 *
4409 * Drivers that implement ndo_rx_flow_steer() should periodically call
4410 * this function for each installed filter and remove the filters for
4411 * which it returns %true.
4412 */
4413bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4414                         u32 flow_id, u16 filter_id)
4415{
4416        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4417        struct rps_dev_flow_table *flow_table;
4418        struct rps_dev_flow *rflow;
4419        bool expire = true;
4420        unsigned int cpu;
4421
4422        rcu_read_lock();
4423        flow_table = rcu_dereference(rxqueue->rps_flow_table);
4424        if (flow_table && flow_id <= flow_table->mask) {
4425                rflow = &flow_table->flows[flow_id];
4426                cpu = READ_ONCE(rflow->cpu);
4427                if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4428                    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4429                           rflow->last_qtail) <
4430                     (int)(10 * flow_table->mask)))
4431                        expire = false;
4432        }
4433        rcu_read_unlock();
4434        return expire;
4435}
4436EXPORT_SYMBOL(rps_may_expire_flow);
4437
4438#endif /* CONFIG_RFS_ACCEL */
4439
4440/* Called from hardirq (IPI) context */
4441static void rps_trigger_softirq(void *data)
4442{
4443        struct softnet_data *sd = data;
4444
4445        ____napi_schedule(sd, &sd->backlog);
4446        sd->received_rps++;
4447}
4448
4449#endif /* CONFIG_RPS */
4450
4451/*
4452 * Check if this softnet_data structure belongs to another CPU.
4453 * If yes, queue it on our IPI list and return 1;
4454 * if no, return 0.
4455 */
4456static int rps_ipi_queued(struct softnet_data *sd)
4457{
4458#ifdef CONFIG_RPS
4459        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4460
4461        if (sd != mysd) {
4462                sd->rps_ipi_next = mysd->rps_ipi_list;
4463                mysd->rps_ipi_list = sd;
4464
4465                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4466                return 1;
4467        }
4468#endif /* CONFIG_RPS */
4469        return 0;
4470}
4471
4472#ifdef CONFIG_NET_FLOW_LIMIT
4473int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4474#endif
4475
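/* Per-flow backlog limiting (CONFIG_NET_FLOW_LIMIT): once the backlog
 * is more than half full, recent packets are hashed into a small bucket
 * history, and a flow that accounts for more than half of that history
 * gets its new packets dropped, so a single heavy flow cannot
 * monopolize the backlog queue.
 */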
4476static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4477{
4478#ifdef CONFIG_NET_FLOW_LIMIT
4479        struct sd_flow_limit *fl;
4480        struct softnet_data *sd;
4481        unsigned int old_flow, new_flow;
4482
4483        if (qlen < (netdev_max_backlog >> 1))
4484                return false;
4485
4486        sd = this_cpu_ptr(&softnet_data);
4487
4488        rcu_read_lock();
4489        fl = rcu_dereference(sd->flow_limit);
4490        if (fl) {
4491                new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4492                old_flow = fl->history[fl->history_head];
4493                fl->history[fl->history_head] = new_flow;
4494
4495                fl->history_head++;
4496                fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4497
4498                if (likely(fl->buckets[old_flow]))
4499                        fl->buckets[old_flow]--;
4500
4501                if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4502                        fl->count++;
4503                        rcu_read_unlock();
4504                        return true;
4505                }
4506        }
4507        rcu_read_unlock();
4508#endif
4509        return false;
4510}
4511
4512/*
4513 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
4514 * queue (which may be a remote CPU's queue).
4515 */
4516static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4517                              unsigned int *qtail)
4518{
4519        struct softnet_data *sd;
4520        unsigned long flags;
4521        unsigned int qlen;
4522
4523        sd = &per_cpu(softnet_data, cpu);
4524
4525        local_irq_save(flags);
4526
4527        rps_lock(sd);
4528        if (!netif_running(skb->dev))
4529                goto drop;
4530        qlen = skb_queue_len(&sd->input_pkt_queue);
4531        if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4532                if (qlen) {
4533enqueue:
4534                        __skb_queue_tail(&sd->input_pkt_queue, skb);
4535                        input_queue_tail_incr_save(sd, qtail);
4536                        rps_unlock(sd);
4537                        local_irq_restore(flags);
4538                        return NET_RX_SUCCESS;
4539                }
4540
4541                /* Schedule NAPI for the backlog device.
4542                 * We can use a non-atomic operation since we own the queue lock.
4543                 */
4544                if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4545                        if (!rps_ipi_queued(sd))
4546                                ____napi_schedule(sd, &sd->backlog);
4547                }
4548                goto enqueue;
4549        }
4550
4551drop:
4552        sd->dropped++;
4553        rps_unlock(sd);
4554
4555        local_irq_restore(flags);
4556
4557        atomic_long_inc(&skb->dev->rx_dropped);
4558        kfree_skb(skb);
4559        return NET_RX_DROP;
4560}
4561
4562static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4563{
4564        struct net_device *dev = skb->dev;
4565        struct netdev_rx_queue *rxqueue;
4566
4567        rxqueue = dev->_rx;
4568
4569        if (skb_rx_queue_recorded(skb)) {
4570                u16 index = skb_get_rx_queue(skb);
4571
4572                if (unlikely(index >= dev->real_num_rx_queues)) {
4573                        WARN_ONCE(dev->real_num_rx_queues > 1,
4574                                  "%s received packet on queue %u, but number "
4575                                  "of RX queues is %u\n",
4576                                  dev->name, index, dev->real_num_rx_queues);
4577
4578                        return rxqueue; /* Return first rxqueue */
4579                }
4580                rxqueue += index;
4581        }
4582        return rxqueue;
4583}
4584
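/* Generic (skb-based) XDP: build an xdp_buff over the skb's linear data,
 * run the BPF program, then fold any head/tail or MAC header changes the
 * program made back into the skb before returning the verdict.
 */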
4585static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4586                                     struct xdp_buff *xdp,
4587                                     struct bpf_prog *xdp_prog)
4588{
4589        struct netdev_rx_queue *rxqueue;
4590        void *orig_data, *orig_data_end;
4591        u32 metalen, act = XDP_DROP;
4592        __be16 orig_eth_type;
4593        struct ethhdr *eth;
4594        bool orig_bcast;
4595        int hlen, off;
4596        u32 mac_len;
4597
4598        /* Reinjected packets coming from act_mirred or similar should
4599         * not get XDP generic processing.
4600         */
4601        if (skb_is_redirected(skb))
4602                return XDP_PASS;
4603
4604        /* XDP packets must be linear and must have sufficient headroom
4605         * of XDP_PACKET_HEADROOM bytes. This is the guarantee that native
4606         * XDP also provides, so we need to enforce it here as well.
4607         */
4608        if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4609            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4610                int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4611                int troom = skb->tail + skb->data_len - skb->end;
4612
4613                /* In case we have to go down that path and also linearize,
4614                 * let's do the pskb_expand_head() work just once here.
4615                 */
4616                if (pskb_expand_head(skb,
4617                                     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4618                                     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4619                        goto do_drop;
4620                if (skb_linearize(skb))
4621                        goto do_drop;
4622        }
4623
4624        /* The XDP program wants to see the packet starting at the MAC
4625         * header.
4626         */
4627        mac_len = skb->data - skb_mac_header(skb);
4628        hlen = skb_headlen(skb) + mac_len;
4629        xdp->data = skb->data - mac_len;
4630        xdp->data_meta = xdp->data;
4631        xdp->data_end = xdp->data + hlen;
4632        xdp->data_hard_start = skb->data - skb_headroom(skb);
4633
4634        /* SKB "head" area always has tailroom for skb_shared_info */
4635        xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
4636        xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4637
4638        orig_data_end = xdp->data_end;
4639        orig_data = xdp->data;
4640        eth = (struct ethhdr *)xdp->data;
4641        orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4642        orig_eth_type = eth->h_proto;
4643
4644        rxqueue = netif_get_rxqueue(skb);
4645        xdp->rxq = &rxqueue->xdp_rxq;
4646
4647        act = bpf_prog_run_xdp(xdp_prog, xdp);
4648
4649        /* check if bpf_xdp_adjust_head was used */
4650        off = xdp->data - orig_data;
4651        if (off) {
4652                if (off > 0)
4653                        __skb_pull(skb, off);
4654                else if (off < 0)
4655                        __skb_push(skb, -off);
4656
4657                skb->mac_header += off;
4658                skb_reset_network_header(skb);
4659        }
4660
4661        /* check if bpf_xdp_adjust_tail was used */
4662        off = xdp->data_end - orig_data_end;
4663        if (off != 0) {
4664                skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4665                skb->len += off; /* positive on grow, negative on shrink */
4666        }
4667
4668        /* check if XDP changed the eth hdr such that the SKB needs an update */
4669        eth = (struct ethhdr *)xdp->data;
4670        if ((orig_eth_type != eth->h_proto) ||
4671            (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4672                __skb_push(skb, ETH_HLEN);
4673                skb->protocol = eth_type_trans(skb, skb->dev);
4674        }
4675
4676        switch (act) {
4677        case XDP_REDIRECT:
4678        case XDP_TX:
4679                __skb_push(skb, mac_len);
4680                break;
4681        case XDP_PASS:
4682                metalen = xdp->data - xdp->data_meta;
4683                if (metalen)
4684                        skb_metadata_set(skb, metalen);
4685                break;
4686        default:
4687                bpf_warn_invalid_xdp_action(act);
4688                /* fall through */
4689        case XDP_ABORTED:
4690                trace_xdp_exception(skb->dev, xdp_prog, act);
4691                /* fall through */
4692        case XDP_DROP:
4693        do_drop:
4694                kfree_skb(skb);
4695                break;
4696        }
4697
4698        return act;
4699}
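
/*
 * Illustrative sketch (not part of dev.c): a minimal XDP program of the
 * kind the generic hook above would run on each skb.  It parses the
 * Ethernet and IPv4 headers and drops UDP packets, passing everything
 * else up the stack.  Program, section and license names are examples
 * only; the object would be built separately with clang/libbpf and can
 * be attached in generic (skb) mode with e.g.
 * "ip link set dev eth0 xdpgeneric obj prog.o sec xdp".
 */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("xdp")
int xdp_drop_ipv4_udp(struct xdp_md *ctx)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data = (void *)(long)ctx->data;
        struct ethhdr *eth = data;
        struct iphdr *iph;

        /* Bounds checks keep the verifier happy. */
        if ((void *)(eth + 1) > data_end)
                return XDP_PASS;
        if (eth->h_proto != bpf_htons(ETH_P_IP))
                return XDP_PASS;

        iph = (void *)(eth + 1);
        if ((void *)(iph + 1) > data_end)
                return XDP_PASS;

        return iph->protocol == IPPROTO_UDP ? XDP_DROP : XDP_PASS;
}

char _license[] SEC("license") = "GPL";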
4700
4701/* When doing generic XDP we have to bypass the qdisc layer and the
4702 * network taps in order to match in-driver-XDP behavior.
4703 */
4704void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4705{
4706        struct net_device *dev = skb->dev;
4707        struct netdev_queue *txq;
4708        bool free_skb = true;
4709        int cpu, rc;
4710
4711        txq = netdev_core_pick_tx(dev, skb, NULL);
4712        cpu = smp_processor_id();
4713        HARD_TX_LOCK(dev, txq, cpu);
4714        if (!netif_xmit_stopped(txq)) {
4715                rc = netdev_start_xmit(skb, dev, txq, 0);
4716                if (dev_xmit_complete(rc))
4717                        free_skb = false;
4718        }
4719        HARD_TX_UNLOCK(dev, txq);
4720        if (free_skb) {
4721                trace_xdp_exception(dev, xdp_prog, XDP_TX);
4722                kfree_skb(skb);
4723        }
4724}
4725
4726static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4727
4728int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4729{
4730        if (xdp_prog) {
4731                struct xdp_buff xdp;
4732                u32 act;
4733                int err;
4734
4735                act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4736                if (act != XDP_PASS) {
4737                        switch (act) {
4738                        case XDP_REDIRECT:
4739                                err = xdp_do_generic_redirect(skb->dev, skb,
4740                                                              &xdp, xdp_prog);
4741                                if (err)
4742                                        goto out_redir;
4743                                break;
4744                        case XDP_TX:
4745                                generic_xdp_tx(skb, xdp_prog);
4746                                break;
4747                        }
4748                        return XDP_DROP;
4749                }
4750        }
4751        return XDP_PASS;
4752out_redir:
4753        kfree_skb(skb);
4754        return XDP_DROP;
4755}
4756EXPORT_SYMBOL_GPL(do_xdp_generic);
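
/*
 * Illustrative sketch (not part of dev.c): a driver that builds skbs in
 * its own receive path (tun/tap style) can run the generic hook exported
 * above itself.  "my_dev" and the function name are hypothetical; the
 * BH/RCU protection roughly mirrors what callers of do_xdp_generic()
 * provide.  A non-XDP_PASS return means the skb has already been
 * consumed (dropped, transmitted or redirected).
 */
static bool my_run_generic_xdp(struct net_device *my_dev, struct sk_buff *skb)
{
        struct bpf_prog *xdp_prog;
        u32 act = XDP_PASS;

        local_bh_disable();
        rcu_read_lock();
        xdp_prog = rcu_dereference(my_dev->xdp_prog);
        if (xdp_prog)
                act = do_xdp_generic(xdp_prog, skb);
        rcu_read_unlock();
        local_bh_enable();

        return act == XDP_PASS;         /* true: caller keeps processing the skb */
}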
4757
4758static int netif_rx_internal(struct sk_buff *skb)
4759{
4760        int ret;
4761
4762        net_timestamp_check(netdev_tstamp_prequeue, skb);
4763
4764        trace_netif_rx(skb);
4765
4766#ifdef CONFIG_RPS
4767        if (static_branch_unlikely(&rps_needed)) {
4768                struct rps_dev_flow voidflow, *rflow = &voidflow;
4769                int cpu;
4770
4771                preempt_disable();
4772                rcu_read_lock();
4773
4774                cpu = get_rps_cpu(skb->dev, skb, &rflow);
4775                if (cpu < 0)
4776                        cpu = smp_processor_id();
4777
4778                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4779
4780                rcu_read_unlock();
4781                preempt_enable();
4782        } else
4783#endif
4784        {
4785                unsigned int qtail;
4786
4787                ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4788                put_cpu();
4789        }
4790        return ret;
4791}
4792
4793/**
4794 *      netif_rx        -       post buffer to the network code
4795 *      @skb: buffer to post
4796 *
4797 *      This function receives a packet from a device driver and queues it for
4798 *      the upper (protocol) levels to process.  It always succeeds. The buffer
4799 *      may be dropped during processing for congestion control or by the
4800 *      protocol layers.
4801 *
4802 *      return values:
4803 *      NET_RX_SUCCESS  (no congestion)
4804 *      NET_RX_DROP     (packet was dropped)
4805 *
4806 */
4807
4808int netif_rx(struct sk_buff *skb)
4809{
4810        int ret;
4811
4812        trace_netif_rx_entry(skb);
4813
4814        ret = netif_rx_internal(skb);
4815        trace_netif_rx_exit(ret);
4816
4817        return ret;
4818}
4819EXPORT_SYMBOL(netif_rx);
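
/*
 * Illustrative sketch (not part of dev.c): how a simple, non-NAPI driver
 * interrupt handler might hand a received frame to the stack with
 * netif_rx().  The frame length and the copy from the device are
 * hypothetical placeholders; only the skb setup and the netif_rx() call
 * reflect the API documented above.
 */
static irqreturn_t my_rx_interrupt(int irq, void *dev_id)
{
        struct net_device *dev = dev_id;
        unsigned int len = 128;                 /* length reported by the device */
        struct sk_buff *skb;

        skb = netdev_alloc_skb_ip_align(dev, len);
        if (!skb) {
                dev->stats.rx_dropped++;
                return IRQ_HANDLED;
        }

        skb_put(skb, len);                      /* copy of the frame would go here */
        skb->protocol = eth_type_trans(skb, dev);

        netif_rx(skb);                          /* safe from hard interrupt context */
        return IRQ_HANDLED;
}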
4820
4821int netif_rx_ni(struct sk_buff *skb)
4822{
4823        int err;
4824
4825        trace_netif_rx_ni_entry(skb);
4826
4827        preempt_disable();
4828        err = netif_rx_internal(skb);
4829        if (local_softirq_pending())
4830                do_softirq();
4831        preempt_enable();
4832        trace_netif_rx_ni_exit(err);
4833
4834        return err;
4835}
4836EXPORT_SYMBOL(netif_rx_ni);
4837
4838static __latent_entropy void net_tx_action(struct softirq_action *h)
4839{
4840        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4841
4842        if (sd->completion_queue) {
4843                struct sk_buff *clist;
4844
4845                local_irq_disable();
4846                clist = sd->completion_queue;
4847                sd->completion_queue = NULL;
4848                local_irq_enable();
4849
4850                while (clist) {
4851                        struct sk_buff *skb = clist;
4852
4853                        clist = clist->next;
4854
4855                        WARN_ON(refcount_read(&skb->users));
4856                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4857                                trace_consume_skb(skb);
4858                        else
4859                                trace_kfree_skb(skb, net_tx_action);
4860
4861                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4862                                __kfree_skb(skb);
4863                        else
4864                                __kfree_skb_defer(skb);
4865                }
4866
4867                __kfree_skb_flush();
4868        }
4869
4870        if (sd->output_queue) {
4871                struct Qdisc *head;
4872
4873                local_irq_disable();
4874                head = sd->output_queue;
4875                sd->output_queue = NULL;
4876                sd->output_queue_tailp = &sd->output_queue;
4877                local_irq_enable();
4878
4879                while (head) {
4880                        struct Qdisc *q = head;
4881                        spinlock_t *root_lock = NULL;
4882
4883                        head = head->next_sched;
4884
4885                        if (!(q->flags & TCQ_F_NOLOCK)) {
4886                                root_lock = qdisc_lock(q);
4887                                spin_lock(root_lock);
4888                        }
4889                        /* We need to make sure head->next_sched is read
4890                         * before clearing __QDISC_STATE_SCHED
4891                         */
4892                        smp_mb__before_atomic();
4893                        clear_bit(__QDISC_STATE_SCHED, &q->state);
4894                        qdisc_run(q);
4895                        if (root_lock)
4896                                spin_unlock(root_lock);
4897                }
4898        }
4899
4900        xfrm_dev_backlog(sd);
4901}
4902
4903#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4904/* This hook is defined here for ATM LANE */
4905int (*br_fdb_test_addr_hook)(struct net_device *dev,
4906                             unsigned char *addr) __read_mostly;
4907EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4908#endif
4909
4910static inline struct sk_buff *
4911sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4912                   struct net_device *orig_dev)
4913{
4914#ifdef CONFIG_NET_CLS_ACT
4915        struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4916        struct tcf_result cl_res;
4917
4918        /* If there's at least one ingress present somewhere (so
4919         * we get here via an enabled static key), remaining devices
4920         * that are not configured with an ingress qdisc will bail
4921         * out here.
4922         */
4923        if (!miniq)
4924                return skb;
4925
4926        if (*pt_prev) {
4927                *ret = deliver_skb(skb, *pt_prev, orig_dev);
4928                *pt_prev = NULL;
4929        }
4930
4931        qdisc_skb_cb(skb)->pkt_len = skb->len;
4932        skb->tc_at_ingress = 1;
4933        mini_qdisc_bstats_cpu_update(miniq, skb);
4934
4935        switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
4936                                     &cl_res, false)) {
4937        case TC_ACT_OK:
4938        case TC_ACT_RECLASSIFY:
4939                skb->tc_index = TC_H_MIN(cl_res.classid);
4940                break;
4941        case TC_ACT_SHOT:
4942                mini_qdisc_qstats_cpu_drop(miniq);
4943                kfree_skb(skb);
4944                return NULL;
4945        case TC_ACT_STOLEN:
4946        case TC_ACT_QUEUED:
4947        case TC_ACT_TRAP:
4948                consume_skb(skb);
4949                return NULL;
4950        case TC_ACT_REDIRECT:
4951                /* skb_mac_header check was done by cls/act_bpf, so
4952                 * we can safely push the L2 header back before
4953                 * redirecting to another netdev
4954                 */
4955                __skb_push(skb, skb->mac_len);
4956                skb_do_redirect(skb);
4957                return NULL;
4958        case TC_ACT_CONSUMED:
4959                return NULL;
4960        default:
4961                break;
4962        }
4963#endif /* CONFIG_NET_CLS_ACT */
4964        return skb;
4965}
4966
4967/**
4968 *      netdev_is_rx_handler_busy - check if receive handler is registered
4969 *      @dev: device to check
4970 *
4971 *      Check if a receive handler is already registered for a given device.
4972 *      Return true if there is one.
4973 *
4974 *      The caller must hold the rtnl_mutex.
4975 */
4976bool netdev_is_rx_handler_busy(struct net_device *dev)
4977{
4978        ASSERT_RTNL();
4979        return dev && rtnl_dereference(dev->rx_handler);
4980}
4981EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4982
4983/**
4984 *      netdev_rx_handler_register - register receive handler
4985 *      @dev: device to register a handler for
4986 *      @rx_handler: receive handler to register
4987 *      @rx_handler_data: data pointer that is used by rx handler
4988 *
4989 *      Register a receive handler for a device. This handler will then be
4990 *      called from __netif_receive_skb. A negative errno code is returned
4991 *      on a failure.
4992 *
4993 *      The caller must hold the rtnl_mutex.
4994 *
4995 *      For a general description of rx_handler, see enum rx_handler_result.
4996 */
4997int netdev_rx_handler_register(struct net_device *dev,
4998                               rx_handler_func_t *rx_handler,
4999                               void *rx_handler_data)
5000{
5001        if (netdev_is_rx_handler_busy(dev))
5002                return -EBUSY;
5003
5004        if (dev->priv_flags & IFF_NO_RX_HANDLER)
5005                return -EINVAL;
5006
5007        /* Note: rx_handler_data must be set before rx_handler */
5008        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5009        rcu_assign_pointer(dev->rx_handler, rx_handler);
5010
5011        return 0;
5012}
5013EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
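
/*
 * Illustrative sketch (not part of dev.c): registering and unregistering
 * an rx_handler the way stacked devices such as bridge or macvlan do.
 * "struct my_port" and the handler body are hypothetical; the
 * registration pattern and the rtnl locking requirement follow the
 * kernel-doc above and below.
 */
struct my_port;                                 /* hypothetical per-port state */

static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);

        /* Inspect, mangle or steal the packet here. */
        (void)port;
        return RX_HANDLER_PASS;                 /* let normal delivery continue */
}

static int my_port_attach(struct net_device *dev, struct my_port *port)
{
        int err;

        rtnl_lock();
        err = netdev_rx_handler_register(dev, my_handle_frame, port);
        rtnl_unlock();
        return err;
}

static void my_port_detach(struct net_device *dev)
{
        rtnl_lock();
        netdev_rx_handler_unregister(dev);
        rtnl_unlock();
}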
5014
5015/**
5016 *      netdev_rx_handler_unregister - unregister receive handler
5017 *      @dev: device to unregister a handler from
5018 *
5019 *      Unregister a receive handler from a device.
5020 *
5021 *      The caller must hold the rtnl_mutex.
5022 */
5023void netdev_rx_handler_unregister(struct net_device *dev)
5024{
5025
5026        ASSERT_RTNL();
5027        RCU_INIT_POINTER(dev->rx_handler, NULL);
5028        /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
5029         * section is guaranteed to see a non-NULL rx_handler_data
5030         * as well.
5031         */
5032        synchronize_net();
5033        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5034}
5035EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5036
5037/*
5038 * Limit the use of PFMEMALLOC reserves to those protocols that implement
5039 * the special handling of PFMEMALLOC skbs.
5040 */
5041static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5042{
5043        switch (skb->protocol) {
5044        case htons(ETH_P_ARP):
5045        case htons(ETH_P_IP):
5046        case htons(ETH_P_IPV6):
5047        case htons(ETH_P_8021Q):
5048        case htons(ETH_P_8021AD):
5049                return true;
5050        default:
5051                return false;
5052        }
5053}
5054
5055static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5056                             int *ret, struct net_device *orig_dev)
5057{
5058        if (nf_hook_ingress_active(skb)) {
5059                int ingress_retval;
5060
5061                if (*pt_prev) {
5062                        *ret = deliver_skb(skb, *pt_prev, orig_dev);
5063                        *pt_prev = NULL;
5064                }
5065
5066                rcu_read_lock();
5067                ingress_retval = nf_hook_ingress(skb);
5068                rcu_read_unlock();
5069                return ingress_retval;
5070        }
5071        return 0;
5072}
5073
5074static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5075                                    struct packet_type **ppt_prev)
5076{
5077        struct packet_type *ptype, *pt_prev;
5078        rx_handler_func_t *rx_handler;
5079        struct sk_buff *skb = *pskb;
5080        struct net_device *orig_dev;
5081        bool deliver_exact = false;
5082        int ret = NET_RX_DROP;
5083        __be16 type;
5084
5085        net_timestamp_check(!netdev_tstamp_prequeue, skb);
5086
5087        trace_netif_receive_skb(skb);
5088
5089        orig_dev = skb->dev;
5090
5091        skb_reset_network_header(skb);
5092        if (!skb_transport_header_was_set(skb))
5093                skb_reset_transport_header(skb);
5094        skb_reset_mac_len(skb);
5095
5096        pt_prev = NULL;
5097
5098another_round:
5099        skb->skb_iif = skb->dev->ifindex;
5100
5101        __this_cpu_inc(softnet_data.processed);
5102
5103        if (static_branch_unlikely(&generic_xdp_needed_key)) {
5104                int ret2;
5105
5106                preempt_disable();
5107                ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5108                preempt_enable();
5109
5110                if (ret2 != XDP_PASS) {
5111                        ret = NET_RX_DROP;
5112                        goto out;
5113                }
5114                skb_reset_mac_len(skb);
5115        }
5116
5117        if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5118            skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5119                skb = skb_vlan_untag(skb);
5120                if (unlikely(!skb))
5121                        goto out;
5122        }
5123
5124        if (skb_skip_tc_classify(skb))
5125                goto skip_classify;
5126
5127        if (pfmemalloc)
5128                goto skip_taps;
5129
5130        list_for_each_entry_rcu(ptype, &ptype_all, list) {
5131                if (pt_prev)
5132                        ret = deliver_skb(skb, pt_prev, orig_dev);
5133                pt_prev = ptype;
5134        }
5135
5136        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5137                if (pt_prev)
5138                        ret = deliver_skb(skb, pt_prev, orig_dev);
5139                pt_prev = ptype;
5140        }
5141
5142skip_taps:
5143#ifdef CONFIG_NET_INGRESS
5144        if (static_branch_unlikely(&ingress_needed_key)) {
5145                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
5146                if (!skb)
5147                        goto out;
5148
5149                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5150                        goto out;
5151        }
5152#endif
5153        skb_reset_redirect(skb);
5154skip_classify:
5155        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5156                goto drop;
5157
5158        if (skb_vlan_tag_present(skb)) {
5159                if (pt_prev) {
5160                        ret = deliver_skb(skb, pt_prev, orig_dev);
5161                        pt_prev = NULL;
5162                }
5163                if (vlan_do_receive(&skb))
5164                        goto another_round;
5165                else if (unlikely(!skb))
5166                        goto out;
5167        }
5168
5169        rx_handler = rcu_dereference(skb->dev->rx_handler);
5170        if (rx_handler) {
5171                if (pt_prev) {
5172                        ret = deliver_skb(skb, pt_prev, orig_dev);
5173                        pt_prev = NULL;
5174                }
5175                switch (rx_handler(&skb)) {
5176                case RX_HANDLER_CONSUMED:
5177                        ret = NET_RX_SUCCESS;
5178                        goto out;
5179                case RX_HANDLER_ANOTHER:
5180                        goto another_round;
5181                case RX_HANDLER_EXACT:
5182                        deliver_exact = true;
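                        /* fall through */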
5183                case RX_HANDLER_PASS:
5184                        break;
5185                default:
5186                        BUG();
5187                }
5188        }
5189
5190        if (unlikely(skb_vlan_tag_present(skb))) {
5191check_vlan_id:
5192                if (skb_vlan_tag_get_id(skb)) {
5193                        /* Vlan id is non 0 and vlan_do_receive() above couldn't
5194                         * find vlan device.
5195                         */
5196                        skb->pkt_type = PACKET_OTHERHOST;
5197                } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
5198                           skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
5199                        /* Outer header is 802.1P with vlan 0, inner header is
5200                         * 802.1Q or 802.1AD and vlan_do_receive() above could
5201                         * not find vlan dev for vlan id 0.
5202                         */
5203                        __vlan_hwaccel_clear_tag(skb);
5204                        skb = skb_vlan_untag(skb);
5205                        if (unlikely(!skb))
5206                                goto out;
5207                        if (vlan_do_receive(&skb))
5208                                /* After stripping off 802.1P header with vlan 0
5209                                 * vlan dev is found for inner header.
5210                                 */
5211                                goto another_round;
5212                        else if (unlikely(!skb))
5213                                goto out;
5214                        else
5215                                /* We have stripped outer 802.1P vlan 0 header.
5216                                 * But could not find vlan dev.
5217                                 * check again for vlan id to set OTHERHOST.
5218                                 */
5219                                goto check_vlan_id;
5220                }
5221                /* Note: we might in the future use prio bits
5222                 * and set skb->priority like in vlan_do_receive()
5223                 * For the time being, just ignore Priority Code Point
5224                 */
5225                __vlan_hwaccel_clear_tag(skb);
5226        }
5227
5228        type = skb->protocol;
5229
5230        /* deliver only exact match when indicated */
5231        if (likely(!deliver_exact)) {
5232                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5233                                       &ptype_base[ntohs(type) &
5234                                                   PTYPE_HASH_MASK]);
5235        }
5236
5237        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5238                               &orig_dev->ptype_specific);
5239
5240        if (unlikely(skb->dev != orig_dev)) {
5241                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5242                                       &skb->dev->ptype_specific);
5243        }
5244
5245        if (pt_prev) {
5246                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5247                        goto drop;
5248                *ppt_prev = pt_prev;
5249        } else {
5250drop:
5251                if (!deliver_exact)
5252                        atomic_long_inc(&skb->dev->rx_dropped);
5253                else
5254                        atomic_long_inc(&skb->dev->rx_nohandler);
5255                kfree_skb(skb);
5256                /* Jamal, now you will not be able to escape explaining
5257                 * to me how you were going to use this. :-)
5258                 */
5259                ret = NET_RX_DROP;
5260        }
5261
5262out:
5263        /* The invariant here is that if *ppt_prev is not NULL
5264         * then skb should also be non-NULL.
5265         *
5266         * The *ppt_prev assignment above upholds this invariant because
5267         * skb is dereferenced right next to it.
5268         */
5269        *pskb = skb;
5270        return ret;
5271}
5272
5273static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5274{
5275        struct net_device *orig_dev = skb->dev;
5276        struct packet_type *pt_prev = NULL;
5277        int ret;
5278
5279        ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5280        if (pt_prev)
5281                ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5282                                         skb->dev, pt_prev, orig_dev);
5283        return ret;
5284}
5285
5286/**
5287 *      netif_receive_skb_core - special purpose version of netif_receive_skb
5288 *      @skb: buffer to process
5289 *
5290 *      More direct receive version of netif_receive_skb().  It should
5291 *      only be used by callers that have a need to skip RPS and Generic XDP.
5292 *      The caller must also take care of handling ``(page_is_)pfmemalloc`` skbs itself.
5293 *
5294 *      This function may only be called from softirq context and interrupts
5295 *      should be enabled.
5296 *
5297 *      Return values (usually ignored):
5298 *      NET_RX_SUCCESS: no congestion
5299 *      NET_RX_DROP: packet was dropped
5300 */
5301int netif_receive_skb_core(struct sk_buff *skb)
5302{
5303        int ret;
5304
5305        rcu_read_lock();
5306        ret = __netif_receive_skb_one_core(skb, false);
5307        rcu_read_unlock();
5308
5309        return ret;
5310}
5311EXPORT_SYMBOL(netif_receive_skb_core);
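
/*
 * Illustrative sketch (not part of dev.c): a caller that has already
 * dealt with RPS and generic XDP on its own (as the kernel-doc above
 * requires) handing a frame straight to the core receive path.  It must
 * run in softirq context; the surrounding driver logic is hypothetical.
 */
static void my_deliver_direct(struct net_device *dev, struct sk_buff *skb)
{
        skb->protocol = eth_type_trans(skb, dev);
        netif_receive_skb_core(skb);
}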
5312
5313static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5314                                                  struct packet_type *pt_prev,
5315                                                  struct net_device *orig_dev)
5316{
5317        struct sk_buff *skb, *next;
5318
5319        if (!pt_prev)
5320                return;
5321        if (list_empty(head))
5322                return;
5323        if (pt_prev->list_func != NULL)
5324                INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5325                                   ip_list_rcv, head, pt_prev, orig_dev);
5326        else
5327                list_for_each_entry_safe(skb, next, head, list) {
5328                        skb_list_del_init(skb);
5329                        pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5330                }
5331}
5332
5333static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5334{
5335        /* Fast-path assumptions:
5336         * - There is no RX handler.
5337         * - Only one packet_type matches.
5338         * If either of these fails, we will end up doing some per-packet
5339         * processing in-line, then handling the 'last ptype' for the whole
5340         * sublist.  This can't cause out-of-order delivery to any single ptype,
5341         * because the 'last ptype' must be constant across the sublist, and all
5342         * other ptypes are handled per-packet.
5343         */
5344        /* Current (common) ptype of sublist */
5345        struct packet_type *pt_curr = NULL;
5346        /* Current (common) orig_dev of sublist */
5347        struct net_device *od_curr = NULL;
5348        struct list_head sublist;
5349        struct sk_buff *skb, *next;
5350
5351        INIT_LIST_HEAD(&sublist);
5352        list_for_each_entry_safe(skb, next, head, list) {
5353                struct net_device *orig_dev = skb->dev;
5354                struct packet_type *pt_prev = NULL;
5355
5356                skb_list_del_init(skb);
5357                __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5358                if (!pt_prev)
5359                        continue;
5360                if (pt_curr != pt_prev || od_curr != orig_dev) {
5361                        /* dispatch old sublist */
5362                        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5363                        /* start new sublist */
5364                        INIT_LIST_HEAD(&sublist);
5365                        pt_curr = pt_prev;
5366                        od_curr = orig_dev;
5367                }
5368                list_add_tail(&skb->list, &sublist);
5369        }
5370
5371        /* dispatch final sublist */
5372        __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5373}
5374
5375static int __netif_receive_skb(struct sk_buff *skb)
5376{
5377        int ret;
5378
5379        if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5380                unsigned int noreclaim_flag;
5381
5382                /*
5383                 * PFMEMALLOC skbs are special, they should
5384                 * - be delivered to SOCK_MEMALLOC sockets only
5385                 * - stay away from userspace
5386                 * - have bounded memory usage
5387                 *
5388                 * Use PF_MEMALLOC as this saves us from propagating the allocation
5389                 * context down to all allocation sites.
5390                 */
5391                noreclaim_flag = memalloc_noreclaim_save();
5392                ret = __netif_receive_skb_one_core(skb, true);
5393                memalloc_noreclaim_restore(noreclaim_flag);
5394        } else
5395                ret = __netif_receive_skb_one_core(skb, false);
5396
5397        return ret;
5398}
5399
5400static void __netif_receive_skb_list(struct list_head *head)
5401{
5402        unsigned long noreclaim_flag = 0;
5403        struct sk_buff *skb, *next;
5404        bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5405
5406        list_for_each_entry_safe(skb, next, head, list) {
5407                if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5408                        struct list_head sublist;
5409
5410                        /* Handle the previous sublist */
5411                        list_cut_before(&sublist, head, &skb->list);
5412                        if (!list_empty(&sublist))
5413                                __netif_receive_skb_list_core(&sublist, pfmemalloc);
5414                        pfmemalloc = !pfmemalloc;
5415                        /* See comments in __netif_receive_skb */
5416                        if (pfmemalloc)
5417                                noreclaim_flag = memalloc_noreclaim_save();
5418                        else
5419                                memalloc_noreclaim_restore(noreclaim_flag);
5420                }
5421        }
5422        /* Handle the remaining sublist */
5423        if (!list_empty(head))
5424                __netif_receive_skb_list_core(head, pfmemalloc);
5425        /* Restore pflags */
5426        if (pfmemalloc)
5427                memalloc_noreclaim_restore(noreclaim_flag);
5428}
5429
5430static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5431{
5432        struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5433        struct bpf_prog *new = xdp->prog;
5434        int ret = 0;
5435
5436        if (new) {
5437                u32 i;
5438
5439                /* generic XDP does not work with DEVMAPs that can
5440                 * have a bpf_prog installed on an entry
5441                 */
5442                for (i = 0; i < new->aux->used_map_cnt; i++) {
5443                        if (dev_map_can_have_prog(new->aux->used_maps[i]))
5444                                return -EINVAL;
5445                }
5446        }
5447
5448        switch (xdp->command) {
5449        case XDP_SETUP_PROG:
5450                rcu_assign_pointer(dev->xdp_prog, new);
5451                if (old)
5452                        bpf_prog_put(old);
5453
5454                if (old && !new) {
5455                        static_branch_dec(&generic_xdp_needed_key);
5456                } else if (new && !old) {
5457                        static_branch_inc(&generic_xdp_needed_key);
5458                        dev_disable_lro(dev);
5459                        dev_disable_gro_hw(dev);
5460                }
5461                break;
5462
5463        case XDP_QUERY_PROG:
5464                xdp->prog_id = old ? old->aux->id : 0;
5465                break;
5466
5467        default:
5468                ret = -EINVAL;
5469                break;
5470        }
5471
5472        return ret;
5473}
5474
5475static int netif_receive_skb_internal(struct sk_buff *skb)
5476{
5477        int ret;
5478
5479        net_timestamp_check(netdev_tstamp_prequeue, skb);
5480
5481        if (skb_defer_rx_timestamp(skb))
5482                return NET_RX_SUCCESS;
5483
5484        rcu_read_lock();
5485#ifdef CONFIG_RPS
5486        if (static_branch_unlikely(&rps_needed)) {
5487                struct rps_dev_flow voidflow, *rflow = &voidflow;
5488                int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5489
5490                if (cpu >= 0) {
5491                        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5492                        rcu_read_unlock();
5493                        return ret;
5494                }
5495        }
5496#endif
5497        ret = __netif_receive_skb(skb);
5498        rcu_read_unlock();
5499        return ret;
5500}
5501
5502static void netif_receive_skb_list_internal(struct list_head *head)
5503{
5504        struct sk_buff *skb, *next;
5505        struct list_head sublist;
5506
5507        INIT_LIST_HEAD(&sublist);
5508        list_for_each_entry_safe(skb, next, head, list) {
5509                net_timestamp_check(netdev_tstamp_prequeue, skb);
5510                skb_list_del_init(skb);
5511                if (!skb_defer_rx_timestamp(skb))
5512                        list_add_tail(&skb->list, &sublist);
5513        }
5514        list_splice_init(&sublist, head);
5515
5516        rcu_read_lock();
5517#ifdef CONFIG_RPS
5518        if (static_branch_unlikely(&rps_needed)) {
5519                list_for_each_entry_safe(skb, next, head, list) {
5520                        struct rps_dev_flow voidflow, *rflow = &voidflow;
5521                        int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5522
5523                        if (cpu >= 0) {
5524                                /* Will be handled, remove from list */
5525                                skb_list_del_init(skb);
5526                                enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5527                        }
5528                }
5529        }
5530#endif
5531        __netif_receive_skb_list(head);
5532        rcu_read_unlock();
5533}
5534
5535/**
5536 *      netif_receive_skb - process receive buffer from network
5537 *      @skb: buffer to process
5538 *
5539 *      netif_receive_skb() is the main receive data processing function.
5540 *      It always succeeds. The buffer may be dropped during processing
5541 *      for congestion control or by the protocol layers.
5542 *
5543 *      This function may only be called from softirq context and interrupts
5544 *      should be enabled.
5545 *
5546 *      Return values (usually ignored):
5547 *      NET_RX_SUCCESS: no congestion
5548 *      NET_RX_DROP: packet was dropped
5549 */
5550int netif_receive_skb(struct sk_buff *skb)
5551{
5552        int ret;
5553
5554        trace_netif_receive_skb_entry(skb);
5555
5556        ret = netif_receive_skb_internal(skb);
5557        trace_netif_receive_skb_exit(ret);
5558
5559        return ret;
5560}
5561EXPORT_SYMBOL(netif_receive_skb);
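
/*
 * Illustrative sketch (not part of dev.c): calling netif_receive_skb()
 * from a driver's NAPI poll routine, i.e. from softirq context with
 * interrupts enabled as required above.  Pulling completed frames off a
 * hardware ring is reduced to a hypothetical my_get_completed_skb()
 * helper.
 */
static struct sk_buff *my_get_completed_skb(struct napi_struct *napi); /* hypothetical */

static int my_poll(struct napi_struct *napi, int budget)
{
        int work_done = 0;

        while (work_done < budget) {
                struct sk_buff *skb = my_get_completed_skb(napi);

                if (!skb)
                        break;
                skb->protocol = eth_type_trans(skb, napi->dev);
                netif_receive_skb(skb);
                work_done++;
        }

        if (work_done < budget)
                napi_complete_done(napi, work_done);
        return work_done;
}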
5562
5563/**
5564 *      netif_receive_skb_list - process many receive buffers from network
5565 *      @head: list of skbs to process.
5566 *
5567 *      Since the return value of netif_receive_skb() is normally ignored, and
5568 *      wouldn't be meaningful for a list, this function returns void.
5569 *
5570 *      This function may only be called from softirq context and interrupts
5571 *      should be enabled.
5572 */
5573void netif_receive_skb_list(struct list_head *head)
5574{
5575        struct sk_buff *skb;
5576
5577        if (list_empty(head))
5578                return;
5579        if (trace_netif_receive_skb_list_entry_enabled()) {
5580                list_for_each_entry(skb, head, list)
5581                        trace_netif_receive_skb_list_entry(skb);
5582        }
5583        netif_receive_skb_list_internal(head);
5584        trace_netif_receive_skb_list_exit(0);
5585}
5586EXPORT_SYMBOL(netif_receive_skb_list);
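
/*
 * Illustrative sketch (not part of dev.c): batching frames on a list and
 * handing them to the stack in one netif_receive_skb_list() call, which
 * lets the core process them as a batch (see
 * netif_receive_skb_list_internal() above).  my_get_completed_skb() is
 * the same hypothetical helper as in the previous sketch.
 */
static int my_poll_batched(struct napi_struct *napi, int budget)
{
        struct list_head list;
        int work_done = 0;

        INIT_LIST_HEAD(&list);
        while (work_done < budget) {
                struct sk_buff *skb = my_get_completed_skb(napi); /* hypothetical */

                if (!skb)
                        break;
                skb->protocol = eth_type_trans(skb, napi->dev);
                list_add_tail(&skb->list, &list);
                work_done++;
        }

        if (!list_empty(&list))
                netif_receive_skb_list(&list);
        if (work_done < budget)
                napi_complete_done(napi, work_done);
        return work_done;
}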
5587
5588DEFINE_PER_CPU(struct work_struct, flush_works);
5589
5590/* Network device is going away, flush any packets still pending */
5591static void flush_backlog(struct work_struct *work)
5592{
5593        struct sk_buff *skb, *tmp;
5594        struct softnet_data *sd;
5595
5596        local_bh_disable();
5597        sd = this_cpu_ptr(&softnet_data);
5598
5599        local_irq_disable();
5600        rps_lock(sd);
5601        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5602                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5603                        __skb_unlink(skb, &sd->input_pkt_queue);
5604                        dev_kfree_skb_irq(skb);
5605                        input_queue_head_incr(sd);
5606                }
5607        }
5608        rps_unlock(sd);
5609        local_irq_enable();
5610
5611        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5612                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5613                        __skb_unlink(skb, &sd->process_queue);
5614                        kfree_skb(skb);
5615                        input_queue_head_incr(sd);
5616                }
5617        }
5618        local_bh_enable();
5619}
5620
5621static void flush_all_backlogs(void)
5622{
5623        unsigned int cpu;
5624
5625        get_online_cpus();
5626
5627        for_each_online_cpu(cpu)
5628                queue_work_on(cpu, system_highpri_wq,
5629                              per_cpu_ptr(&flush_works, cpu));
5630
5631        for_each_online_cpu(cpu)
5632                flush_work(per_cpu_ptr(&flush_works, cpu));
5633
5634        put_online_cpus();
5635}
5636
5637/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5638static void gro_normal_list(struct napi_struct *napi)
5639{
5640        if (!napi->rx_count)
5641                return;
5642        netif_receive_skb_list_internal(&napi->rx_list);
5643        INIT_LIST_HEAD(&napi->rx_list);
5644        napi->rx_count = 0;
5645}
5646
5647/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
5648 * pass the whole batch up to the stack.
5649 */
5650static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
5651{
5652        list_add_tail(&skb->list, &napi->rx_list);
5653        if (++napi->rx_count >= gro_normal_batch)
5654                gro_normal_list(napi);
5655}
5656
5657INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
5658INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
5659static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
5660{
5661        struct packet_offload *ptype;
5662        __be16 type = skb->protocol;
5663        struct list_head *head = &offload_base;
5664        int err = -ENOENT;
5665
5666        BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5667
5668        if (NAPI_GRO_CB(skb)->count == 1) {
5669                skb_shinfo(skb)->gso_size = 0;
5670                goto out;
5671        }
5672
5673        rcu_read_lock();
5674        list_for_each_entry_rcu(ptype, head, list) {
5675                if (ptype->type != type || !ptype->callbacks.gro_complete)
5676                        continue;
5677
5678                err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5679                                         ipv6_gro_complete, inet_gro_complete,
5680                                         skb, 0);
5681                break;
5682        }
5683        rcu_read_unlock();
5684
5685        if (err) {
5686                WARN_ON(&ptype->list == head);
5687                kfree_skb(skb);
5688                return NET_RX_SUCCESS;
5689        }
5690
5691out:
5692        gro_normal_one(napi, skb);
5693        return NET_RX_SUCCESS;
5694}
5695
5696static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5697                                   bool flush_old)
5698{
5699        struct list_head *head = &napi->gro_hash[index].list;
5700        struct sk_buff *skb, *p;
5701
5702        list_for_each_entry_safe_reverse(skb, p, head, list) {
5703                if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5704                        return;
5705                skb_list_del_init(skb);
5706                napi_gro_complete(napi, skb);
5707                napi->gro_hash[index].count--;
5708        }
5709
5710        if (!napi->gro_hash[index].count)
5711                __clear_bit(index, &napi->gro_bitmask);
5712}
5713
5714/* napi->gro_hash[].list contains packets ordered by age, with the
5715 * youngest packets at the head of it.
5716 * Complete skbs in reverse order to reduce latencies.
5717 */
5718void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5719{
5720        unsigned long bitmask = napi->gro_bitmask;
5721        unsigned int i, base = ~0U;
5722
5723        while ((i = ffs(bitmask)) != 0) {
5724                bitmask >>= i;
5725                base += i;
5726                __napi_gro_flush_chain(napi, base, flush_old);
5727        }
5728}
5729EXPORT_SYMBOL(napi_gro_flush);
5730
5731static struct list_head *gro_list_prepare(struct napi_struct *napi,
5732                                          struct sk_buff *skb)
5733{
5734        unsigned int maclen = skb->dev->hard_header_len;
5735        u32 hash = skb_get_hash_raw(skb);
5736        struct list_head *head;
5737        struct sk_buff *p;
5738
5739        head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5740        list_for_each_entry(p, head, list) {
5741                unsigned long diffs;
5742
5743                NAPI_GRO_CB(p)->flush = 0;
5744
5745                if (hash != skb_get_hash_raw(p)) {
5746                        NAPI_GRO_CB(p)->same_flow = 0;
5747                        continue;
5748                }
5749
5750                diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5751                diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5752                if (skb_vlan_tag_present(p))
5753                        diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
5754                diffs |= skb_metadata_dst_cmp(p, skb);
5755                diffs |= skb_metadata_differs(p, skb);
5756                if (maclen == ETH_HLEN)
5757                        diffs |= compare_ether_header(skb_mac_header(p),
5758                                                      skb_mac_header(skb));
5759                else if (!diffs)
5760                        diffs = memcmp(skb_mac_header(p),
5761                                       skb_mac_header(skb),
5762                                       maclen);
5763                NAPI_GRO_CB(p)->same_flow = !diffs;
5764        }
5765
5766        return head;
5767}
5768
5769static void skb_gro_reset_offset(struct sk_buff *skb)
5770{
5771        const struct skb_shared_info *pinfo = skb_shinfo(skb);
5772        const skb_frag_t *frag0 = &pinfo->frags[0];
5773
5774        NAPI_GRO_CB(skb)->data_offset = 0;
5775        NAPI_GRO_CB(skb)->frag0 = NULL;
5776        NAPI_GRO_CB(skb)->frag0_len = 0;
5777
5778        if (!skb_headlen(skb) && pinfo->nr_frags &&
5779            !PageHighMem(skb_frag_page(frag0))) {
5780                NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5781                NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5782                                                    skb_frag_size(frag0),
5783                                                    skb->end - skb->tail);
5784        }
5785}
5786
5787static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5788{
5789        struct skb_shared_info *pinfo = skb_shinfo(skb);
5790
5791        BUG_ON(skb->end - skb->tail < grow);
5792
5793        memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5794
5795        skb->data_len -= grow;
5796        skb->tail += grow;
5797
5798        skb_frag_off_add(&pinfo->frags[0], grow);
5799        skb_frag_size_sub(&pinfo->frags[0], grow);
5800
5801        if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5802                skb_frag_unref(skb, 0);
5803                memmove(pinfo->frags, pinfo->frags + 1,
5804                        --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5805        }
5806}
5807
5808static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
5809{
5810        struct sk_buff *oldest;
5811
5812        oldest = list_last_entry(head, struct sk_buff, list);
5813
5814        /* We are called with head length >= MAX_GRO_SKBS, so this is
5815         * impossible.
5816         */
5817        if (WARN_ON_ONCE(!oldest))
5818                return;
5819
5820        /* Do not adjust napi->gro_hash[].count, caller is adding a new
5821         * SKB to the chain.
5822         */
5823        skb_list_del_init(oldest);
5824        napi_gro_complete(napi, oldest);
5825}
5826
5827INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
5828                                                           struct sk_buff *));
5829INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
5830                                                           struct sk_buff *));
5831static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5832{
5833        u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5834        struct list_head *head = &offload_base;
5835        struct packet_offload *ptype;
5836        __be16 type = skb->protocol;
5837        struct list_head *gro_head;
5838        struct sk_buff *pp = NULL;
5839        enum gro_result ret;
5840        int same_flow;
5841        int grow;
5842
5843        if (netif_elide_gro(skb->dev))
5844                goto normal;
5845
5846        gro_head = gro_list_prepare(napi, skb);
5847
5848        rcu_read_lock();
5849        list_for_each_entry_rcu(ptype, head, list) {
5850                if (ptype->type != type || !ptype->callbacks.gro_receive)
5851                        continue;
5852
5853                skb_set_network_header(skb, skb_gro_offset(skb));
5854                skb_reset_mac_len(skb);
5855                NAPI_GRO_CB(skb)->same_flow = 0;
5856                NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5857                NAPI_GRO_CB(skb)->free = 0;
5858                NAPI_GRO_CB(skb)->encap_mark = 0;
5859                NAPI_GRO_CB(skb)->recursion_counter = 0;
5860                NAPI_GRO_CB(skb)->is_fou = 0;
5861                NAPI_GRO_CB(skb)->is_atomic = 1;
5862                NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5863
5864                /* Setup for GRO checksum validation */
5865                switch (skb->ip_summed) {
5866                case CHECKSUM_COMPLETE:
5867                        NAPI_GRO_CB(skb)->csum = skb->csum;
5868                        NAPI_GRO_CB(skb)->csum_valid = 1;
5869                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5870                        break;
5871                case CHECKSUM_UNNECESSARY:
5872                        NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
5873                        NAPI_GRO_CB(skb)->csum_valid = 0;
5874                        break;
5875                default:
5876                        NAPI_GRO_CB(skb)->csum_cnt = 0;
5877                        NAPI_GRO_CB(skb)->csum_valid = 0;
5878                }
5879
5880                pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
5881                                        ipv6_gro_receive, inet_gro_receive,
5882                                        gro_head, skb);
5883                break;
5884        }
5885        rcu_read_unlock();
5886
5887        if (&ptype->list == head)
5888                goto normal;
5889
5890        if (PTR_ERR(pp) == -EINPROGRESS) {
5891                ret = GRO_CONSUMED;
5892                goto ok;
5893        }
5894
5895        same_flow = NAPI_GRO_CB(skb)->same_flow;
5896        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5897
5898        if (pp) {
5899                skb_list_del_init(pp);
5900                napi_gro_complete(napi, pp);
5901                napi->gro_hash[hash].count--;
5902        }
5903
5904        if (same_flow)
5905                goto ok;
5906
5907        if (NAPI_GRO_CB(skb)->flush)
5908                goto normal;
5909
5910        if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
5911                gro_flush_oldest(napi, gro_head);
5912        } else {
5913                napi->gro_hash[hash].count++;
5914        }
5915        NAPI_GRO_CB(skb)->count = 1;
5916        NAPI_GRO_CB(skb)->age = jiffies;
5917        NAPI_GRO_CB(skb)->last = skb;
5918        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5919        list_add(&skb->list, gro_head);
5920        ret = GRO_HELD;
5921
5922pull:
5923        grow = skb_gro_offset(skb) - skb_headlen(skb);
5924        if (grow > 0)
5925                gro_pull_from_frag0(skb, grow);
5926ok:
5927        if (napi->gro_hash[hash].count) {
5928                if (!test_bit(hash, &napi->gro_bitmask))
5929                        __set_bit(hash, &napi->gro_bitmask);
5930        } else if (test_bit(hash, &napi->gro_bitmask)) {
5931                __clear_bit(hash, &napi->gro_bitmask);
5932        }
5933
5934        return ret;
5935
5936normal:
5937        ret = GRO_NORMAL;
5938        goto pull;
5939}
5940
5941struct packet_offload *gro_find_receive_by_type(__be16 type)
5942{
5943        struct list_head *offload_head = &offload_base;
5944        struct packet_offload *ptype;
5945
5946        list_for_each_entry_rcu(ptype, offload_head, list) {
5947                if (ptype->type != type || !ptype->callbacks.gro_receive)
5948                        continue;
5949                return ptype;
5950        }
5951        return NULL;
5952}
5953EXPORT_SYMBOL(gro_find_receive_by_type);
5954
5955struct packet_offload *gro_find_complete_by_type(__be16 type)
5956{
5957        struct list_head *offload_head = &offload_base;
5958        struct packet_offload *ptype;
5959
5960        list_for_each_entry_rcu(ptype, offload_head, list) {
5961                if (ptype->type != type || !ptype->callbacks.gro_complete)
5962                        continue;
5963                return ptype;
5964        }
5965        return NULL;
5966}
5967EXPORT_SYMBOL(gro_find_complete_by_type);
5968
5969static void napi_skb_free_stolen_head(struct sk_buff *skb)
5970{
5971        skb_dst_drop(skb);
5972        skb_ext_put(skb);
5973        kmem_cache_free(skbuff_head_cache, skb);
5974}
5975
5976static gro_result_t napi_skb_finish(struct napi_struct *napi,
5977                                    struct sk_buff *skb,
5978                                    gro_result_t ret)
5979{
5980        switch (ret) {
5981        case GRO_NORMAL:
5982                gro_normal_one(napi, skb);
5983                break;
5984
5985        case GRO_DROP:
5986                kfree_skb(skb);
5987                break;
5988
5989        case GRO_MERGED_FREE:
5990                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5991                        napi_skb_free_stolen_head(skb);
5992                else
5993                        __kfree_skb(skb);
5994                break;
5995
5996        case GRO_HELD:
5997        case GRO_MERGED:
5998        case GRO_CONSUMED:
5999                break;
6000        }
6001
6002        return ret;
6003}
6004
6005gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
6006{
6007        gro_result_t ret;
6008
6009        skb_mark_napi_id(skb, napi);
6010        trace_napi_gro_receive_entry(skb);
6011
6012        skb_gro_reset_offset(skb);
6013
6014        ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
6015        trace_napi_gro_receive_exit(ret);
6016
6017        return ret;
6018}
6019EXPORT_SYMBOL(napi_gro_receive);
6020
6021static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
6022{
6023        if (unlikely(skb->pfmemalloc)) {
6024                consume_skb(skb);
6025                return;
6026        }
6027        __skb_pull(skb, skb_headlen(skb));
6028        /* restore the reserve we had after netdev_alloc_skb_ip_align() */
6029        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
6030        __vlan_hwaccel_clear_tag(skb);
6031        skb->dev = napi->dev;
6032        skb->skb_iif = 0;
6033
6034        /* eth_type_trans() assumes pkt_type is PACKET_HOST */
6035        skb->pkt_type = PACKET_HOST;
6036
6037        skb->encapsulation = 0;
6038        skb_shinfo(skb)->gso_type = 0;
6039        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6040        skb_ext_reset(skb);
6041
6042        napi->skb = skb;
6043}
6044
6045struct sk_buff *napi_get_frags(struct napi_struct *napi)
6046{
6047        struct sk_buff *skb = napi->skb;
6048
6049        if (!skb) {
6050                skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
6051                if (skb) {
6052                        napi->skb = skb;
6053                        skb_mark_napi_id(skb, napi);
6054                }
6055        }
6056        return skb;
6057}
6058EXPORT_SYMBOL(napi_get_frags);
6059
6060static gro_result_t napi_frags_finish(struct napi_struct *napi,
6061                                      struct sk_buff *skb,
6062                                      gro_result_t ret)
6063{
6064        switch (ret) {
6065        case GRO_NORMAL:
6066        case GRO_HELD:
6067                __skb_push(skb, ETH_HLEN);
6068                skb->protocol = eth_type_trans(skb, skb->dev);
6069                if (ret == GRO_NORMAL)
6070                        gro_normal_one(napi, skb);
6071                break;
6072
6073        case GRO_DROP:
6074                napi_reuse_skb(napi, skb);
6075                break;
6076
6077        case GRO_MERGED_FREE:
6078                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
6079                        napi_skb_free_stolen_head(skb);
6080                else
6081                        napi_reuse_skb(napi, skb);
6082                break;
6083
6084        case GRO_MERGED:
6085        case GRO_CONSUMED:
6086                break;
6087        }
6088
6089        return ret;
6090}
6091
6092/* The upper GRO stack assumes the network header starts at gro_offset=0.
6093 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
6094 * we copy the Ethernet header into skb->data to have a common layout.
6095 */
6096static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
6097{
6098        struct sk_buff *skb = napi->skb;
6099        const struct ethhdr *eth;
6100        unsigned int hlen = sizeof(*eth);
6101
6102        napi->skb = NULL;
6103
6104        skb_reset_mac_header(skb);
6105        skb_gro_reset_offset(skb);
6106
6107        if (unlikely(skb_gro_header_hard(skb, hlen))) {
6108                eth = skb_gro_header_slow(skb, hlen, 0);
6109                if (unlikely(!eth)) {
6110                        net_warn_ratelimited("%s: dropping impossible skb from %s\n",
6111                                             __func__, napi->dev->name);
6112                        napi_reuse_skb(napi, skb);
6113                        return NULL;
6114                }
6115        } else {
6116                eth = (const struct ethhdr *)skb->data;
6117                gro_pull_from_frag0(skb, hlen);
6118                NAPI_GRO_CB(skb)->frag0 += hlen;
6119                NAPI_GRO_CB(skb)->frag0_len -= hlen;
6120        }
6121        __skb_pull(skb, hlen);
6122
6123        /*
6124         * This works because the only protocols we care about don't require
6125         * special handling.
6126         * We'll fix it up properly in napi_frags_finish()
6127         */
6128        skb->protocol = eth->h_proto;
6129
6130        return skb;
6131}
6132
6133gro_result_t napi_gro_frags(struct napi_struct *napi)
6134{
6135        gro_result_t ret;
6136        struct sk_buff *skb = napi_frags_skb(napi);
6137
6138        if (!skb)
6139                return GRO_DROP;
6140
6141        trace_napi_gro_frags_entry(skb);
6142
6143        ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6144        trace_napi_gro_frags_exit(ret);
6145
6146        return ret;
6147}
6148EXPORT_SYMBOL(napi_gro_frags);
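
/*
 * Illustrative sketch (not part of dev.c): the napi_get_frags()/
 * napi_gro_frags() pattern used by drivers that receive directly into
 * page fragments instead of linear buffers.  The page, offset and length
 * are hypothetical values taken from a hardware descriptor; truesize is
 * approximated with PAGE_SIZE here.
 */
static void my_receive_frag(struct napi_struct *napi, struct page *page,
                            unsigned int offset, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (!skb) {
                put_page(page);
                return;
        }

        skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
                        PAGE_SIZE);
        napi_gro_frags(napi);   /* hands napi->skb to GRO, see napi_frags_skb() */
}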
6149
6150/* Compute the checksum from gro_offset and return the folded value
6151 * after adding in any pseudo checksum.
6152 */
6153__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
6154{
6155        __wsum wsum;
6156        __sum16 sum;
6157
6158        wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
6159
6160        /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
6161        sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6162        /* See comments in __skb_checksum_complete(). */
6163        if (likely(!sum)) {
6164                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
6165                    !skb->csum_complete_sw)
6166                        netdev_rx_csum_fault(skb->dev, skb);
6167        }
6168
6169        NAPI_GRO_CB(skb)->csum = wsum;
6170        NAPI_GRO_CB(skb)->csum_valid = 1;
6171
6172        return sum;
6173}
6174EXPORT_SYMBOL(__skb_gro_checksum_complete);
6175
6176static void net_rps_send_ipi(struct softnet_data *remsd)
6177{
6178#ifdef CONFIG_RPS
6179        while (remsd) {
6180                struct softnet_data *next = remsd->rps_ipi_next;
6181
6182                if (cpu_online(remsd->cpu))
6183                        smp_call_function_single_async(remsd->cpu, &remsd->csd);
6184                remsd = next;
6185        }
6186#endif
6187}
6188
6189/*
6190 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
6191 * Note: called with local irq disabled, but exits with local irq enabled.
6192 */
6193static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6194{
6195#ifdef CONFIG_RPS
6196        struct softnet_data *remsd = sd->rps_ipi_list;
6197
6198        if (remsd) {
6199                sd->rps_ipi_list = NULL;
6200
6201                local_irq_enable();
6202
6203                /* Send pending IPIs to kick RPS processing on remote CPUs. */
6204                net_rps_send_ipi(remsd);
6205        } else
6206#endif
6207                local_irq_enable();
6208}
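
/* Each softnet_data queued on the rps_ipi list belongs to a remote CPU that
 * had packets steered to it by RPS/RFS.  The IPI sent through
 * smp_call_function_single_async() runs that CPU's csd callback, which
 * schedules the per-CPU backlog NAPI there (raising NET_RX_SOFTIRQ) so the
 * steered packets get processed.
 */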
6209
6210static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6211{
6212#ifdef CONFIG_RPS
6213        return sd->rps_ipi_list != NULL;
6214#else
6215        return false;
6216#endif
6217}
6218
6219static int process_backlog(struct napi_struct *napi, int quota)
6220{
6221        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6222        bool again = true;
6223        int work = 0;
6224
6225        /* If we have pending IPIs, it is better to send them now
6226         * rather than waiting for net_rx_action() to end.
6227         */
6228        if (sd_has_rps_ipi_waiting(sd)) {
6229                local_irq_disable();
6230                net_rps_action_and_irq_enable(sd);
6231        }
6232
6233        napi->weight = dev_rx_weight;
6234        while (again) {
6235                struct sk_buff *skb;
6236
6237                while ((skb = __skb_dequeue(&sd->process_queue))) {
6238                        rcu_read_lock();
6239                        __netif_receive_skb(skb);
6240                        rcu_read_unlock();
6241                        input_queue_head_incr(sd);
6242                        if (++work >= quota)
6243                                return work;
6244
6245                }
6246
6247                local_irq_disable();
6248                rps_lock(sd);
6249                if (skb_queue_empty(&sd->input_pkt_queue)) {
6250                        /*
6251                         * Inline a custom version of __napi_complete().
6252                         * Only the current CPU owns and manipulates this napi,
6253                         * and NAPI_STATE_SCHED is the only possible flag set
6254                         * on backlog.
6255                         * We can use a plain write instead of clear_bit(),
6256                         * and we don't need an smp_mb() memory barrier.
6257                         */
6258                        napi->state = 0;
6259                        again = false;
6260                } else {
6261                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
6262                                                   &sd->process_queue);
6263                }
6264                rps_unlock(sd);
6265                local_irq_enable();
6266        }
6267
6268        return work;
6269}
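
/* process_backlog() is the poll callback of the per-CPU backlog NAPI embedded
 * in softnet_data.  It is fed by the non-NAPI netif_rx() path and by RPS,
 * which enqueue packets on input_pkt_queue; batches are spliced over to
 * process_queue and delivered through __netif_receive_skb() under
 * rcu_read_lock(), with the per-pass limit taken from dev_rx_weight.
 */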
6270
6271/**
6272 * __napi_schedule - schedule for receive
6273 * @n: entry to schedule
6274 *
6275 * The entry's receive function will be scheduled to run.
6276 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6277 */
6278void __napi_schedule(struct napi_struct *n)
6279{
6280        unsigned long flags;
6281
6282        local_irq_save(flags);
6283        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6284        local_irq_restore(flags);
6285}
6286EXPORT_SYMBOL(__napi_schedule);
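
/*
 * A minimal sketch of the usual driver-side counterpart: a hard interrupt
 * handler masks the device RX interrupt and defers the work to NAPI.
 * my_isr(), my_priv and my_hw_disable_rx_irq() are hypothetical names:
 *
 *	static irqreturn_t my_isr(int irq, void *dev_id)
 *	{
 *		struct my_priv *priv = dev_id;
 *
 *		my_hw_disable_rx_irq(priv);
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */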
6287
6288/**
6289 *      napi_schedule_prep - check if napi can be scheduled
6290 *      @n: napi context
6291 *
6292 * Test if the NAPI routine is already running, and if not mark
6293 * it as running.  This is used as a condition variable to
6294 * ensure only one NAPI poll instance runs.  We also make
6295 * sure there is no pending NAPI disable.
6296 */
6297bool napi_schedule_prep(struct napi_struct *n)
6298{
6299        unsigned long val, new;
6300
6301        do {
6302                val = READ_ONCE(n->state);
6303                if (unlikely(val & NAPIF_STATE_DISABLE))
6304                        return false;
6305                new = val | NAPIF_STATE_SCHED;
6306
6307                /* Set the STATE_MISSED bit if STATE_SCHED was already set.
6308                 * This was suggested by Alexander Duyck, as the compiler
6309                 * emits better code than:
6310                 * if (val & NAPIF_STATE_SCHED)
6311                 *     new |= NAPIF_STATE_MISSED;
6312                 */
6313                new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6314                                                   NAPIF_STATE_MISSED;
6315        } while (cmpxchg(&n->state, val, new) != val);
6316
6317        return !(val & NAPIF_STATE_SCHED);
6318}
6319EXPORT_SYMBOL(napi_schedule_prep);
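
/* Note on the arithmetic above: NAPIF_STATE_SCHED is a single bit, so
 * (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED is either 0 or 1, and the
 * multiplication by NAPIF_STATE_MISSED therefore sets the MISSED bit exactly
 * when SCHED was already set, without a conditional branch.
 */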
6320
6321/**
6322 * __napi_schedule_irqoff - schedule for receive
6323 * @n: entry to schedule
6324 *
6325 * Variant of __napi_schedule() assuming hard irqs are masked
6326 */
6327void __napi_schedule_irqoff(struct napi_struct *n)
6328{
6329        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6330}
6331EXPORT_SYMBOL(__napi_schedule_irqoff);
6332
6333bool napi_complete_done(struct napi_struct *n, int work_done)
6334{
6335        unsigned long flags, val, new, timeout = 0;
6336        bool ret = true;
6337
6338        /*
6339         * 1) Don't let napi dequeue from the cpu poll list
6340         *    just in case it's running on a different cpu.
6341         * 2) If we are busy polling, do nothing here; we have
6342         *    the guarantee we will be called later.
6343         */
6344        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6345                                 NAPIF_STATE_IN_BUSY_POLL)))
6346                return false;
6347
6348        if (work_done) {
6349                if (n->gro_bitmask)
6350                        timeout = READ_ONCE(n->dev->gro_flush_timeout);
6351                n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6352        }
6353        if (n->defer_hard_irqs_count > 0) {
6354                n->defer_hard_irqs_count--;
6355                timeout = READ_ONCE(n->dev->gro_flush_timeout);
6356                if (timeout)
6357                        ret = false;
6358        }
6359        if (n->gro_bitmask) {
6360                /* When the NAPI instance uses a timeout and keeps postponing
6361                 * it, we need to somehow bound the time packets are kept in
6362                 * the GRO layer.
6363                 */
6364                napi_gro_flush(n, !!timeout);
6365        }
6366
6367        gro_normal_list(n);
6368
6369        if (unlikely(!list_empty(&n->poll_list))) {
6370                /* If n->poll_list is not empty, we need to mask irqs */
6371                local_irq_save(flags);
6372                list_del_init(&n->poll_list);
6373                local_irq_restore(flags);
6374        }
6375
6376        do {
6377                val = READ_ONCE(n->state);
6378
6379                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6380
6381                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
6382
6383                /* If STATE_MISSED was set, leave STATE_SCHED set,
6384                 * because we will call napi->poll() one more time.
6385                 * This C code was suggested by Alexander Duyck to help gcc.
6386                 */
6387                new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6388                                                    NAPIF_STATE_SCHED;
6389        } while (cmpxchg(&n->state, val, new) != val);
6390
6391        if (unlikely(val & NAPIF_STATE_MISSED)) {
6392                __napi_schedule(n);
6393                return false;
6394        }
6395
6396        if (timeout)
6397                hrtimer_start(&n->timer, ns_to_ktime(timeout),
6398                              HRTIMER_MODE_REL_PINNED);
6399        return ret;
6400}
6401EXPORT_SYMBOL(napi_complete_done);
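
/*
 * A minimal sketch of the canonical poll routine built around
 * napi_complete_done(): interrupts are re-enabled only when it returns true,
 * i.e. when NAPI really went idle (no MISSED rescheduling, busy polling or
 * armed deferral timer).  my_poll(), my_priv, my_clean_rx() and
 * my_hw_enable_rx_irq() are hypothetical names:
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_priv *priv = container_of(napi, struct my_priv, napi);
 *		int work = my_clean_rx(priv, budget);
 *
 *		if (work < budget && napi_complete_done(napi, work))
 *			my_hw_enable_rx_irq(priv);
 *
 *		return work;
 *	}
 */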
6402
6403/* Must be called under rcu_read_lock(), as we don't take a reference. */
6404static struct napi_struct *napi_by_id(unsigned int napi_id)
6405{
6406        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6407        struct napi_struct *napi;
6408
6409        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6410                if (napi->napi_id == napi_id)
6411                        return napi;
6412
6413        return NULL;
6414}
6415
6416#if defined(CONFIG_NET_RX_BUSY_POLL)
6417
6418#define BUSY_POLL_BUDGET 8
6419
6420static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
6421{
6422        int rc;
6423
6424        /* Busy polling means there is a high chance the device driver's hard
6425         * irq could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6426         * set in napi_schedule_prep().
6427         * Since we are about to call napi->poll() once more, we can safely
6428         * clear NAPI_STATE_MISSED.
6429         *
6430         * Note: x86 could use a single "lock and ..." instruction
6431         * to perform these two clear_bit() calls.
6432         */
6433        clear_bit(NAPI_STATE_MISSED, &napi->state);
6434        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6435
6436        local_bh_disable();
6437
6438        /* All we really want here is to re-enable device interrupts.
6439         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6440         */
6441        rc = napi->poll(napi, BUSY_POLL_BUDGET);
6442        /* We can't gro_normal_list() here, because napi->poll() might have
6443         * rearmed the napi (napi_complete_done()) in which case it could
6444         * already be running on another CPU.
6445         */
6446        trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
6447        netpoll_poll_unlock(have_poll_lock);
6448        if (rc == BUSY_POLL_BUDGET) {
6449                /* As the whole budget was spent, we still own the napi, so we can
6450                 * safely handle the rx_list.
6451                 */
6452                gro_normal_list(napi);
6453                __napi_schedule(napi);
6454        }
6455        local_bh_enable();
6456}
6457
6458void napi_busy_loop(unsigned int napi_id,
6459                    bool (*loop_end)(void *, unsigned long),
6460                    void *loop_end_arg)
6461{
6462        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6463        int (*napi_poll)(struct napi_struct *napi, int budget);
6464        void *have_poll_lock = NULL;
6465        struct napi_struct *napi;
6466
6467restart:
6468        napi_poll = NULL;
6469
6470        rcu_read_lock();
6471
6472        napi = napi_by_id(napi_id);
6473        if (!napi)
6474                goto out;
6475
6476        preempt_disable();
6477        for (;;) {
6478                int work = 0;
6479
6480                local_bh_disable();
6481                if (!napi_poll) {
6482                        unsigned long val = READ_ONCE(napi->state);
6483
6484                        /* If multiple threads are competing for this napi,
6485                         * we avoid dirtying napi->state as much as we can.
6486                         */
6487                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6488                                   NAPIF_STATE_IN_BUSY_POLL))
6489                                goto count;
6490                        if (cmpxchg(&napi->state, val,
6491                                    val | NAPIF_STATE_IN_BUSY_POLL |
6492                                          NAPIF_STATE_SCHED) != val)
6493                                goto count;
6494                        have_poll_lock = netpoll_poll_lock(napi);
6495                        napi_poll = napi->poll;
6496                }
6497                work = napi_poll(napi, BUSY_POLL_BUDGET);
6498                trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
6499                gro_normal_list(napi);
6500count:
6501                if (work > 0)
6502                        __NET_ADD_STATS(dev_net(napi->dev),
6503                                        LINUX_MIB_BUSYPOLLRXPACKETS, work);
6504                local_bh_enable();
6505
6506                if (!loop_end || loop_end(loop_end_arg, start_time))
6507                        break;
6508
6509                if (unlikely(need_resched())) {
6510                        if (napi_poll)
6511                                busy_poll_stop(napi, have_poll_lock);
6512                        preempt_enable();
6513                        rcu_read_unlock();
6514                        cond_resched();
6515                        if (loop_end(loop_end_arg, start_time))
6516                                return;
6517                        goto restart;
6518                }
6519                cpu_relax();
6520        }
6521        if (napi_poll)
6522                busy_poll_stop(napi, have_poll_lock);
6523        preempt_enable();
6524out:
6525        rcu_read_unlock();
6526}
6527EXPORT_SYMBOL(napi_busy_loop);
6528
6529#endif /* CONFIG_NET_RX_BUSY_POLL */
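
/* napi_busy_loop() is normally entered from the socket layer, e.g. via
 * sk_busy_loop() for sockets using SO_BUSY_POLL or the busy_read sysctl,
 * with loop_end() deciding when to stop, typically once data has arrived
 * or the configured busy-poll time has elapsed.
 */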
6530
6531static void napi_hash_add(struct napi_struct *napi)
6532{
6533        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
6534            test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
6535                return;
6536
6537        spin_lock(&napi_hash_lock);
6538
6539        /* 0..NR_CPUS range is reserved for sender_cpu use */
6540        do {
6541                if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6542                        napi_gen_id = MIN_NAPI_ID;
6543        } while (napi_by_id(napi_gen_id));
6544        napi->napi_id = napi_gen_id;
6545
6546        hlist_add_head_rcu(&napi->napi_hash_node,
6547                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6548
6549        spin_unlock(&napi_hash_lock);
6550}
6551
6552/* Warning: the caller is responsible for making sure an RCU grace period
6553 * is respected before freeing the memory containing @napi.
6554 */
6555bool napi_hash_del(struct napi_struct *napi)
6556{
6557        bool rcu_sync_needed = false;
6558
6559        spin_lock(&napi_hash_lock);
6560
6561        if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
6562                rcu_sync_needed = true;
6563                hlist_del_rcu(&napi->napi_hash_node);
6564        }
6565        spin_unlock(&napi_hash_lock);
6566        return rcu_sync_needed;
6567}
6568EXPORT_SYMBOL_GPL(napi_hash_del);
6569
6570static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6571{
6572        struct napi_struct *napi;
6573
6574        napi = container_of(timer, struct napi_struct, timer);
6575
6576        /* Note: we use a relaxed variant of napi_schedule_prep() that does
6577         * not set NAPI_STATE_MISSED, since we do not react to a device IRQ.
6578         */
6579        if (!napi_disable_pending(napi) &&
6580            !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
6581                __napi_schedule_irqoff(napi);
6582
6583        return HRTIMER_NORESTART;
6584}
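
/* This hrtimer backs the gro_flush_timeout / napi_defer_hard_irqs mechanism
 * used by napi_complete_done() above: while completion keeps being deferred,
 * the device IRQ stays masked and the watchdog re-schedules the NAPI instance
 * when the timeout expires, trading a little latency for fewer hard
 * interrupts under load.
 */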
6585
6586static void init_gro_hash(struct napi_struct *napi)
6587{
6588        int i;
6589
6590        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6591                INIT_LIST_HEAD(&napi->gro_hash[i].list);
6592                napi->gro_hash[i].count = 0;
6593        }
6594        napi->gro_bitmask = 0;
6595}
6596
6597void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6598                    int (*poll)(struct napi_struct *, int), int weight)
6599{
6600        INIT_LIST_HEAD(&napi->poll_list);
6601        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6602        napi->timer.function = napi_watchdog;
6603        init_gro_hash(napi);
6604        napi->skb = NULL;
6605        INIT_LIST_HEAD(&napi->rx_list);
6606        napi->rx_count = 0;
6607        napi->poll = poll;
6608        if (weight > NAPI_POLL_WEIGHT)
6609                netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6610                                weight);
6611        napi->weight = weight;
6612        list_add(&napi->dev_list, &dev->napi_list);
6613        napi->dev = dev;
6614#ifdef CONFIG_NETPOLL
6615        napi->poll_owner = -1;
6616#endif
6617        set_bit(NAPI_STATE_SCHED, &napi->state);
6618        napi_hash_add(napi);
6619}
6620EXPORT_SYMBOL(netif_napi_add);
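
/*
 * A minimal sketch of the usual registration sequence in a driver; my_priv
 * and my_poll() are hypothetical names.  The instance is created with
 * NAPI_STATE_SCHED set (see above), so it cannot be scheduled until
 * napi_enable() clears that bit, typically from ndo_open():
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);
 */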
6621
6622void napi_disable(struct napi_struct *n)
6623{
6624        might_sleep();
6625        set_bit(NAPI_STATE_DISABLE, &n->state);
6626
6627        while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6628                msleep(1);
6629        while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6630                msleep(1);
6631
6632        hrtimer_cancel(&n->timer);
6633
6634        clear_bit(NAPI_STATE_DISABLE, &n->state);
6635}
6636EXPORT_SYMBOL(napi_disable);
6637
6638static void flush_gro_hash(struct napi_struct *napi)
6639{
6640        int i;
6641
6642        for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6643                struct sk_buff *skb, *n;
6644
6645                list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6646                        kfree_skb(skb);
6647                napi->gro_hash[i].count = 0;
6648        }
6649}
6650
6651/* Must be called in process context */
6652void netif_napi_del(struct napi_struct *napi)
6653{
6654        might_sleep();
6655        if (napi_hash_del(napi))
6656                synchronize_net();
6657        list_del_init(&napi->dev_list);
6658        napi_free_frags(napi);
6659
6660        flush_gro_hash(napi);
6661        napi->gro_bitmask = 0;
6662}
6663EXPORT_SYMBOL(netif_napi_del);
6664
6665static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6666{
6667        void *have;
6668        int work, weight;
6669
6670        list_del_init(&n->poll_list);
6671
6672        have = netpoll_poll_lock(n);
6673
6674        weight = n->weight;
6675
6676        /* This NAPI_STATE_SCHED test is for avoiding a race
6677         * with netpoll's poll_napi().  Only the entity which
6678         * obtains the lock and sees NAPI_STATE_SCHED set will
6679         * actually make the ->poll() call.  Therefore we avoid
6680         * accidentally calling ->poll() when NAPI is not scheduled.
6681         */
6682        work = 0;
6683        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6684                work = n->poll(n, weight);
6685                trace_napi_poll(n, work, weight);
6686        }
6687
6688        WARN_ON_ONCE(work > weight);
6689
6690        if (likely(work < weight))
6691                goto out_unlock;
6692
6693        /* Drivers must not modify the NAPI state if they
6694         * consume the entire weight.  In such cases this code
6695         * still "owns" the NAPI instance and therefore can
6696         * move the instance around on the list at-will.
6697         */
6698        if (unlikely(napi_disable_pending(n))) {
6699                napi_complete(n);
6700                goto out_unlock;
6701        }
6702
6703        if (n->gro_bitmask) {
6704                /* flush too old packets
6705                 * If HZ < 1000, flush all packets.
6706                 */
6707                napi_gro_flush(n, HZ >= 1000);
6708        }
6709
6710        gro_normal_list(n);
6711
6712        /* Some drivers may have called napi_schedule
6713         * prior to exhausting their budget.
6714         */
6715        if (unlikely(!list_empty(&n->poll_list))) {
6716                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6717                             n->dev ? n->dev->name : "backlog");
6718                goto out_unlock;
6719        }
6720
6721        list_add_tail(&n->poll_list, repoll);
6722
6723out_unlock:
6724        netpoll_poll_unlock(have);
6725
6726        return work;
6727}
6728
6729static __latent_entropy void net_rx_action(struct softirq_action *h)
6730{
6731        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6732        unsigned long time_limit = jiffies +
6733                usecs_to_jiffies(netdev_budget_usecs);
6734        int budget = netdev_budget;
6735        LIST_HEAD(list);
6736        LIST_HEAD(repoll);
6737
6738        local_irq_disable();
6739        list_splice_init(&sd->poll_list, &list);
6740        local_irq_enable();
6741
6742        for (;;) {
6743                struct napi_struct *n;
6744
6745                if (list_empty(&list)) {
6746                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6747                                goto out;
6748                        break;
6749                }
6750
6751                n = list_first_entry(&list, struct napi_struct, poll_list);
6752                budget -= napi_poll(n, &repoll);
6753
6754                /* If the softirq window is exhausted then punt.
6755                 * Allow this to run for 2 jiffies, which allows
6756                 * an average latency of 1.5/HZ.
6757                 */
6758                if (unlikely(budget <= 0 ||
6759                             time_after_eq(jiffies, time_limit))) {
6760                        sd->time_squeeze++;
6761                        break;
6762                }
6763        }
6764
6765        local_irq_disable();
6766
6767        list_splice_tail_init(&sd->poll_list, &list);
6768        list_splice_tail(&repoll, &list);
6769        list_splice(&list, &sd->poll_list);
6770        if (!list_empty(&sd->poll_list))
6771                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6772
6773        net_rps_action_and_irq_enable(sd);
6774out:
6775        __kfree_skb_flush();
6776}
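
/* net_rx_action() is bounded both by a packet budget (net.core.netdev_budget)
 * and by a time limit (net.core.netdev_budget_usecs).  When either is
 * exceeded with work still pending, time_squeeze is bumped (visible in
 * /proc/net/softnet_stat) and the remaining NAPI instances are left on the
 * poll list for a re-raised NET_RX_SOFTIRQ to pick up.
 */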
6777
6778struct netdev_adjacent {
6779        struct net_device *dev;
6780
6781        /* upper master flag; there can be only one master device per list */
6782        bool master;
6783
6784        /* lookup ignore flag */
6785        bool ignore;
6786
6787        /* counter for the number of times this device was added to us */
6788        u16 ref_nr;
6789
6790        /* private field for the users */
6791        void *private;
6792
6793        struct list_head list;
6794        struct rcu_head rcu;
6795};
6796
6797static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6798                                                 struct list_head *adj_list)
6799{
6800        struct netdev_adjacent *adj;
6801
6802        list_for_each_entry(adj, adj_list, list) {
6803                if (adj->dev == adj_dev)
6804                        return adj;
6805        }
6806        return NULL;
6807}
6808
6809static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
6810{
6811        struct net_device *dev = data;
6812
6813        return upper_dev == dev;
6814}
6815
6816/**
6817 * netdev_has_upper_dev - Check if device is linked to an upper device
6818 * @dev: device
6819 * @upper_dev: upper device to check
6820 *
6821 * Find out if a device is linked to the specified upper device and return
6822 * true in case it is. Note that this checks only the immediate upper device,
6823 * not the complete stack of devices. The caller must hold the RTNL lock.
6824 */
6825bool netdev_has_upper_dev(struct net_device *dev,
6826                          struct net_device *upper_dev)
6827{
6828        ASSERT_RTNL();
6829
6830        return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6831                                             upper_dev);
6832}
6833EXPORT_SYMBOL(netdev_has_upper_dev);
6834
6835/**
6836 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6837 * @dev: device
6838 * @upper_dev: upper device to check
6839 *
6840 * Find out if a device is linked to the specified upper device and return
6841 * true in case it is. Note that this checks the entire upper device chain.
6842 * The caller must hold the RCU read lock.
6843 */
6844
6845bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6846                                  struct net_device *upper_dev)
6847{
6848        return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6849                                               upper_dev);
6850}
6851EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6852
6853/**
6854 * netdev_has_any_upper_dev - Check if device is linked to some device
6855 * @dev: device
6856 *
6857 * Find out if a device is linked to an upper device and return true in case
6858 * it is. The caller must hold the RTNL lock.
6859 */
6860bool netdev_has_any_upper_dev(struct net_device *dev)
6861{
6862        ASSERT_RTNL();
6863
6864        return !list_empty(&dev->adj_list.upper);
6865}
6866EXPORT_SYMBOL(netdev_has_any_upper_dev);
6867
6868/**
6869 * netdev_master_upper_dev_get - Get master upper device
6870 * @dev: device
6871 *
6872 * Find a master upper device and return pointer to it or NULL in case
6873 * it's not there. The caller must hold the RTNL lock.
6874 */
6875struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6876{
6877        struct netdev_adjacent *upper;
6878
6879        ASSERT_RTNL();
6880
6881        if (list_empty(&dev->adj_list.upper))
6882                return NULL;
6883
6884        upper = list_first_entry(&dev->adj_list.upper,
6885                                 struct netdev_adjacent, list);
6886        if (likely(upper->master))
6887                return upper->dev;
6888        return NULL;
6889}
6890EXPORT_SYMBOL(netdev_master_upper_dev_get);
6891
6892static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6893{
6894        struct netdev_adjacent *upper;
6895
6896        ASSERT_RTNL();
6897
6898        if (list_empty(&dev->adj_list.upper))
6899                return NULL;
6900
6901        upper = list_first_entry(&dev->adj_list.upper,
6902                                 struct netdev_adjacent, list);
6903        if (likely(upper->master) && !upper->ignore)
6904                return upper->dev;
6905        return NULL;
6906}
6907
6908/**
6909 * netdev_has_any_lower_dev - Check if device is linked to some device
6910 * @dev: device
6911 *
6912 * Find out if a device is linked to a lower device and return true in case
6913 * it is. The caller must hold the RTNL lock.
6914 */
6915static bool netdev_has_any_lower_dev(struct net_device *dev)
6916{
6917        ASSERT_RTNL();
6918
6919        return !list_empty(&dev->adj_list.lower);
6920}
6921
6922void *netdev_adjacent_get_private(struct list_head *adj_list)
6923{
6924        struct netdev_adjacent *adj;
6925
6926        adj = list_entry(adj_list, struct netdev_adjacent, list);
6927
6928        return adj->private;
6929}
6930EXPORT_SYMBOL(netdev_adjacent_get_private);
6931
6932/**
6933 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6934 * @dev: device
6935 * @iter: list_head ** of the current position
6936 *
6937 * Gets the next device from the dev's upper list, starting from iter
6938 * position. The caller must hold RCU read lock.
6939 */
6940struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6941                                                 struct list_head **iter)
6942{
6943        struct netdev_adjacent *upper;
6944
6945        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6946
6947        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6948
6949        if (&upper->list == &dev->adj_list.upper)
6950                return NULL;
6951
6952        *iter = &upper->list;
6953
6954        return upper->dev;
6955}
6956EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6957
6958static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
6959                                                  struct list_head **iter,
6960                                                  bool *ignore)
6961{
6962        struct netdev_adjacent *upper;
6963
6964        upper = list_entry((*iter)->next, struct netdev_adjacent, list);
6965
6966        if (&upper->list == &dev->adj_list.upper)
6967                return NULL;
6968
6969        *iter = &upper->list;
6970        *ignore = upper->ignore;
6971
6972        return upper->dev;
6973}
6974
6975static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6976                                                    struct list_head **iter)
6977{
6978        struct netdev_adjacent *upper;
6979
6980        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6981
6982        upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6983
6984        if (&upper->list == &dev->adj_list.upper)
6985                return NULL;
6986
6987        *iter = &upper->list;
6988
6989        return upper->dev;
6990}
6991
6992static int __netdev_walk_all_upper_dev(struct net_device *dev,
6993                                       int (*fn)(struct net_device *dev,
6994                                                 void *data),
6995                                       void *data)
6996{
6997        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6998        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6999        int ret, cur = 0;
7000        bool ignore;
7001
7002        now = dev;
7003        iter = &dev->adj_list.upper;
7004
7005        while (1) {
7006                if (now != dev) {
7007                        ret = fn(now, data);
7008                        if (ret)
7009                                return ret;
7010                }
7011
7012                next = NULL;
7013                while (1) {
7014                        udev = __netdev_next_upper_dev(now, &iter, &ignore);
7015                        if (!udev)
7016                                break;
7017                        if (ignore)
7018                                continue;
7019
7020                        next = udev;
7021                        niter = &udev->adj_list.upper;
7022                        dev_stack[cur] = now;
7023                        iter_stack[cur++] = iter;
7024                        break;
7025                }
7026
7027                if (!next) {
7028                        if (!cur)
7029                                return 0;
7030                        next = dev_stack[--cur];
7031                        niter = iter_stack[cur];
7032                }
7033
7034                now = next;
7035                iter = niter;
7036        }
7037
7038        return 0;
7039}
7040
7041int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7042                                  int (*fn)(struct net_device *dev,
7043                                            void *data),
7044                                  void *data)
7045{
7046        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7047        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7048        int ret, cur = 0;
7049
7050        now = dev;
7051        iter = &dev->adj_list.upper;
7052
7053        while (1) {
7054                if (now != dev) {
7055                        ret = fn(now, data);
7056                        if (ret)
7057                                return ret;
7058                }
7059
7060                next = NULL;
7061                while (1) {
7062                        udev = netdev_next_upper_dev_rcu(now, &iter);
7063                        if (!udev)
7064                                break;
7065
7066                        next = udev;
7067                        niter = &udev->adj_list.upper;
7068                        dev_stack[cur] = now;
7069                        iter_stack[cur++] = iter;
7070                        break;
7071                }
7072
7073                if (!next) {
7074                        if (!cur)
7075                                return 0;
7076                        next = dev_stack[--cur];
7077                        niter = iter_stack[cur];
7078                }
7079
7080                now = next;
7081                iter = niter;
7082        }
7083
7084        return 0;
7085}
7086EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
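
/* The walk_all helpers above and below avoid recursion by keeping an explicit
 * stack of (device, iterator) pairs, dev_stack[] / iter_stack[], sized
 * MAX_NEST_DEV + 1.  Together with the upper_level / lower_level bookkeeping
 * this bounds how deeply devices may be stacked (e.g. a VLAN on top of a bond
 * on top of a physical port) while using constant kernel stack space.
 */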
7087
7088static bool __netdev_has_upper_dev(struct net_device *dev,
7089                                   struct net_device *upper_dev)
7090{
7091        ASSERT_RTNL();
7092
7093        return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7094                                           upper_dev);
7095}
7096
7097/**
7098 * netdev_lower_get_next_private - Get the next ->private from the
7099 *                                 lower neighbour list
7100 * @dev: device
7101 * @iter: list_head ** of the current position
7102 *
7103 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7104 * list, starting from iter position. The caller must either hold the
7105 * RTNL lock or its own locking that guarantees that the neighbour lower
7106 * list will remain unchanged.
7107 */
7108void *netdev_lower_get_next_private(struct net_device *dev,
7109                                    struct list_head **iter)
7110{
7111        struct netdev_adjacent *lower;
7112
7113        lower = list_entry(*iter, struct netdev_adjacent, list);
7114
7115        if (&lower->list == &dev->adj_list.lower)
7116                return NULL;
7117
7118        *iter = lower->list.next;
7119
7120        return lower->private;
7121}
7122EXPORT_SYMBOL(netdev_lower_get_next_private);
7123
7124/**
7125 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7126 *                                     lower neighbour list, RCU
7127 *                                     variant
7128 * @dev: device
7129 * @iter: list_head ** of the current position
7130 *
7131 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7132 * list, starting from iter position. The caller must hold RCU read lock.
7133 */
7134void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7135                                        struct list_head **iter)
7136{
7137        struct netdev_adjacent *lower;
7138
7139        WARN_ON_ONCE(!rcu_read_lock_held());
7140
7141        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7142
7143        if (&lower->list == &dev->adj_list.lower)
7144                return NULL;
7145
7146        *iter = &lower->list;
7147
7148        return lower->private;
7149}
7150EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7151
7152/**
7153 * netdev_lower_get_next - Get the next device from the lower neighbour
7154 *                         list
7155 * @dev: device
7156 * @iter: list_head ** of the current position
7157 *
7158 * Gets the next netdev_adjacent from the dev's lower neighbour
7159 * list, starting from iter position. The caller must hold the RTNL lock or
7160 * its own locking that guarantees that the neighbour lower
7161 * list will remain unchanged.
7162 */
7163void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7164{
7165        struct netdev_adjacent *lower;
7166
7167        lower = list_entry(*iter, struct netdev_adjacent, list);
7168
7169        if (&lower->list == &dev->adj_list.lower)
7170                return NULL;
7171
7172        *iter = lower->list.next;
7173
7174        return lower->dev;
7175}
7176EXPORT_SYMBOL(netdev_lower_get_next);
7177
7178static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7179                                                struct list_head **iter)
7180{
7181        struct netdev_adjacent *lower;
7182
7183        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7184
7185        if (&lower->list == &dev->adj_list.lower)
7186                return NULL;
7187
7188        *iter = &lower->list;
7189
7190        return lower->dev;
7191}
7192
7193static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7194                                                  struct list_head **iter,
7195                                                  bool *ignore)
7196{
7197        struct netdev_adjacent *lower;
7198
7199        lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7200
7201        if (&lower->list == &dev->adj_list.lower)
7202                return NULL;
7203
7204        *iter = &lower->list;
7205        *ignore = lower->ignore;
7206
7207        return lower->dev;
7208}
7209
7210int netdev_walk_all_lower_dev(struct net_device *dev,
7211                              int (*fn)(struct net_device *dev,
7212                                        void *data),
7213                              void *data)
7214{
7215        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7216        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7217        int ret, cur = 0;
7218
7219        now = dev;
7220        iter = &dev->adj_list.lower;
7221
7222        while (1) {
7223                if (now != dev) {
7224                        ret = fn(now, data);
7225                        if (ret)
7226                                return ret;
7227                }
7228
7229                next = NULL;
7230                while (1) {
7231                        ldev = netdev_next_lower_dev(now, &iter);
7232                        if (!ldev)
7233                                break;
7234
7235                        next = ldev;
7236                        niter = &ldev->adj_list.lower;
7237                        dev_stack[cur] = now;
7238                        iter_stack[cur++] = iter;
7239                        break;
7240                }
7241
7242                if (!next) {
7243                        if (!cur)
7244                                return 0;
7245                        next = dev_stack[--cur];
7246                        niter = iter_stack[cur];
7247                }
7248
7249                now = next;
7250                iter = niter;
7251        }
7252
7253        return 0;
7254}
7255EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7256
7257static int __netdev_walk_all_lower_dev(struct net_device *dev,
7258                                       int (*fn)(struct net_device *dev,
7259                                                 void *data),
7260                                       void *data)
7261{
7262        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7263        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7264        int ret, cur = 0;
7265        bool ignore;
7266
7267        now = dev;
7268        iter = &dev->adj_list.lower;
7269
7270        while (1) {
7271                if (now != dev) {
7272                        ret = fn(now, data);
7273                        if (ret)
7274                                return ret;
7275                }
7276
7277                next = NULL;
7278                while (1) {
7279                        ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7280                        if (!ldev)
7281                                break;
7282                        if (ignore)
7283                                continue;
7284
7285                        next = ldev;
7286                        niter = &ldev->adj_list.lower;
7287                        dev_stack[cur] = now;
7288                        iter_stack[cur++] = iter;
7289                        break;
7290                }
7291
7292                if (!next) {
7293                        if (!cur)
7294                                return 0;
7295                        next = dev_stack[--cur];
7296                        niter = iter_stack[cur];
7297                }
7298
7299                now = next;
7300                iter = niter;
7301        }
7302
7303        return 0;
7304}
7305
7306struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7307                                             struct list_head **iter)
7308{
7309        struct netdev_adjacent *lower;
7310
7311        lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7312        if (&lower->list == &dev->adj_list.lower)
7313                return NULL;
7314
7315        *iter = &lower->list;
7316
7317        return lower->dev;
7318}
7319EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7320
7321static u8 __netdev_upper_depth(struct net_device *dev)
7322{
7323        struct net_device *udev;
7324        struct list_head *iter;
7325        u8 max_depth = 0;
7326        bool ignore;
7327
7328        for (iter = &dev->adj_list.upper,
7329             udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7330             udev;
7331             udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7332                if (ignore)
7333                        continue;
7334                if (max_depth < udev->upper_level)
7335                        max_depth = udev->upper_level;
7336        }
7337
7338        return max_depth;
7339}
7340
7341static u8 __netdev_lower_depth(struct net_device *dev)
7342{
7343        struct net_device *ldev;
7344        struct list_head *iter;
7345        u8 max_depth = 0;
7346        bool ignore;
7347
7348        for (iter = &dev->adj_list.lower,
7349             ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7350             ldev;
7351             ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7352                if (ignore)
7353                        continue;
7354                if (max_depth < ldev->lower_level)
7355                        max_depth = ldev->lower_level;
7356        }
7357
7358        return max_depth;
7359}
7360
7361static int __netdev_update_upper_level(struct net_device *dev, void *data)
7362{
7363        dev->upper_level = __netdev_upper_depth(dev) + 1;
7364        return 0;
7365}
7366
7367static int __netdev_update_lower_level(struct net_device *dev, void *data)
7368{
7369        dev->lower_level = __netdev_lower_depth(dev) + 1;
7370        return 0;
7371}
7372
7373int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7374                                  int (*fn)(struct net_device *dev,
7375                                            void *data),
7376                                  void *data)
7377{
7378        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7379        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7380        int ret, cur = 0;
7381
7382        now = dev;
7383        iter = &dev->adj_list.lower;
7384
7385        while (1) {
7386                if (now != dev) {
7387                        ret = fn(now, data);
7388                        if (ret)
7389                                return ret;
7390                }
7391
7392                next = NULL;
7393                while (1) {
7394                        ldev = netdev_next_lower_dev_rcu(now, &iter);
7395                        if (!ldev)
7396                                break;
7397
7398                        next = ldev;
7399                        niter = &ldev->adj_list.lower;
7400                        dev_stack[cur] = now;
7401                        iter_stack[cur++] = iter;
7402                        break;
7403                }
7404
7405                if (!next) {
7406                        if (!cur)
7407                                return 0;
7408                        next = dev_stack[--cur];
7409                        niter = iter_stack[cur];
7410                }
7411
7412                now = next;
7413                iter = niter;
7414        }
7415
7416        return 0;
7417}
7418EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7419
7420/**
7421 * netdev_lower_get_first_private_rcu - Get the first ->private from the
7422 *                                     lower neighbour list, RCU
7423 *                                     variant
7424 * @dev: device
7425 *
7426 * Gets the first netdev_adjacent->private from the dev's lower neighbour
7427 * list. The caller must hold RCU read lock.
7428 */
7429void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7430{
7431        struct netdev_adjacent *lower;
7432
7433        lower = list_first_or_null_rcu(&dev->adj_list.lower,
7434                        struct netdev_adjacent, list);
7435        if (lower)
7436                return lower->private;
7437        return NULL;
7438}
7439EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7440
7441/**
7442 * netdev_master_upper_dev_get_rcu - Get master upper device
7443 * @dev: device
7444 *
7445 * Find a master upper device and return pointer to it or NULL in case
7446 * it's not there. The caller must hold the RCU read lock.
7447 */
7448struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7449{
7450        struct netdev_adjacent *upper;
7451
7452        upper = list_first_or_null_rcu(&dev->adj_list.upper,
7453                                       struct netdev_adjacent, list);
7454        if (upper && likely(upper->master))
7455                return upper->dev;
7456        return NULL;
7457}
7458EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7459
7460static int netdev_adjacent_sysfs_add(struct net_device *dev,
7461                              struct net_device *adj_dev,
7462                              struct list_head *dev_list)
7463{
7464        char linkname[IFNAMSIZ+7];
7465
7466        sprintf(linkname, dev_list == &dev->adj_list.upper ?
7467                "upper_%s" : "lower_%s", adj_dev->name);
7468        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7469                                 linkname);
7470}
7471static void netdev_adjacent_sysfs_del(struct net_device *dev,
7472                               char *name,
7473                               struct list_head *dev_list)
7474{
7475        char linkname[IFNAMSIZ+7];
7476
7477        sprintf(linkname, dev_list == &dev->adj_list.upper ?
7478                "upper_%s" : "lower_%s", name);
7479        sysfs_remove_link(&(dev->dev.kobj), linkname);
7480}
7481
7482static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7483                                                 struct net_device *adj_dev,
7484                                                 struct list_head *dev_list)
7485{
7486        return (dev_list == &dev->adj_list.upper ||
7487                dev_list == &dev->adj_list.lower) &&
7488                net_eq(dev_net(dev), dev_net(adj_dev));
7489}
7490
7491static int __netdev_adjacent_dev_insert(struct net_device *dev,
7492                                        struct net_device *adj_dev,
7493                                        struct list_head *dev_list,
7494                                        void *private, bool master)
7495{
7496        struct netdev_adjacent *adj;
7497        int ret;
7498
7499        adj = __netdev_find_adj(adj_dev, dev_list);
7500
7501        if (adj) {
7502                adj->ref_nr += 1;
7503                pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7504                         dev->name, adj_dev->name, adj->ref_nr);
7505
7506                return 0;
7507        }
7508
7509        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7510        if (!adj)
7511                return -ENOMEM;
7512
7513        adj->dev = adj_dev;
7514        adj->master = master;
7515        adj->ref_nr = 1;
7516        adj->private = private;
7517        adj->ignore = false;
7518        dev_hold(adj_dev);
7519
7520        pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7521                 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7522
7523        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7524                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7525                if (ret)
7526                        goto free_adj;
7527        }
7528
7529        /* Ensure that master link is always the first item in list. */
7530        if (master) {
7531                ret = sysfs_create_link(&(dev->dev.kobj),
7532                                        &(adj_dev->dev.kobj), "master");
7533                if (ret)
7534                        goto remove_symlinks;
7535
7536                list_add_rcu(&adj->list, dev_list);
7537        } else {
7538                list_add_tail_rcu(&adj->list, dev_list);
7539        }
7540
7541        return 0;
7542
7543remove_symlinks:
7544        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7545                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7546free_adj:
7547        kfree(adj);
7548        dev_put(adj_dev);
7549
7550        return ret;
7551}
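
/* ref_nr counts how many times the same adjacency has been requested; a
 * device pair can be linked both directly and through intermediate devices.
 * __netdev_adjacent_dev_remove() below only tears the entry down once the
 * count it is asked to drop covers the stored ref_nr, otherwise it merely
 * decrements.
 */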
7552
7553static void __netdev_adjacent_dev_remove(struct net_device *dev,
7554                                         struct net_device *adj_dev,
7555                                         u16 ref_nr,
7556                                         struct list_head *dev_list)
7557{
7558        struct netdev_adjacent *adj;
7559
7560        pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7561                 dev->name, adj_dev->name, ref_nr);
7562
7563        adj = __netdev_find_adj(adj_dev, dev_list);
7564
7565        if (!adj) {
7566                pr_err("Adjacency does not exist for device %s from %s\n",
7567                       dev->name, adj_dev->name);
7568                WARN_ON(1);
7569                return;
7570        }
7571
7572        if (adj->ref_nr > ref_nr) {
7573                pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7574                         dev->name, adj_dev->name, ref_nr,
7575                         adj->ref_nr - ref_nr);
7576                adj->ref_nr -= ref_nr;
7577                return;
7578        }
7579
7580        if (adj->master)
7581                sysfs_remove_link(&(dev->dev.kobj), "master");
7582
7583        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7584                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7585
7586        list_del_rcu(&adj->list);
7587        pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7588                 adj_dev->name, dev->name, adj_dev->name);
7589        dev_put(adj_dev);
7590        kfree_rcu(adj, rcu);
7591}
7592
7593static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7594                                            struct net_device *upper_dev,
7595                                            struct list_head *up_list,
7596                                            struct list_head *down_list,
7597                                            void *private, bool master)
7598{
7599        int ret;
7600
7601        ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7602                                           private, master);
7603        if (ret)
7604                return ret;
7605
7606        ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7607                                           private, false);
7608        if (ret) {
7609                __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7610                return ret;
7611        }
7612
7613        return 0;
7614}
7615
7616static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7617                                               struct net_device *upper_dev,
7618                                               u16 ref_nr,
7619                                               struct list_head *up_list,
7620                                               struct list_head *down_list)
7621{
7622        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7623        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7624}
7625
7626static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7627                                                struct net_device *upper_dev,
7628                                                void *private, bool master)
7629{
7630        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7631                                                &dev->adj_list.upper,
7632                                                &upper_dev->adj_list.lower,
7633                                                private, master);
7634}
7635
7636static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7637                                                   struct net_device *upper_dev)
7638{
7639        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7640                                           &dev->adj_list.upper,
7641                                           &upper_dev->adj_list.lower);
7642}
7643
7644static int __netdev_upper_dev_link(struct net_device *dev,
7645                                   struct net_device *upper_dev, bool master,
7646                                   void *upper_priv, void *upper_info,
7647                                   struct netlink_ext_ack *extack)
7648{
7649        struct netdev_notifier_changeupper_info changeupper_info = {
7650                .info = {
7651                        .dev = dev,
7652                        .extack = extack,
7653                },
7654                .upper_dev = upper_dev,
7655                .master = master,
7656                .linking = true,
7657                .upper_info = upper_info,
7658        };
7659        struct net_device *master_dev;
7660        int ret = 0;
7661
7662        ASSERT_RTNL();
7663
7664        if (dev == upper_dev)
7665                return -EBUSY;
7666
7667        /* To prevent loops, check that dev is not an upper device of upper_dev. */
7668        if (__netdev_has_upper_dev(upper_dev, dev))
7669                return -EBUSY;
7670
7671        if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7672                return -EMLINK;
7673
7674        if (!master) {
7675                if (__netdev_has_upper_dev(dev, upper_dev))
7676                        return -EEXIST;
7677        } else {
7678                master_dev = __netdev_master_upper_dev_get(dev);
7679                if (master_dev)
7680                        return master_dev == upper_dev ? -EEXIST : -EBUSY;
7681        }
7682
7683        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7684                                            &changeupper_info.info);
7685        ret = notifier_to_errno(ret);
7686        if (ret)
7687                return ret;
7688
7689        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7690                                                   master);
7691        if (ret)
7692                return ret;
7693
7694        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7695                                            &changeupper_info.info);
7696        ret = notifier_to_errno(ret);
7697        if (ret)
7698                goto rollback;
7699
7700        __netdev_update_upper_level(dev, NULL);
7701        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7702
7703        __netdev_update_lower_level(upper_dev, NULL);
7704        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7705                                    NULL);
7706
7707        return 0;
7708
7709rollback:
7710        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7711
7712        return ret;
7713}
7714
7715/**
7716 * netdev_upper_dev_link - Add a link to the upper device
7717 * @dev: device
7718 * @upper_dev: new upper device
7719 * @extack: netlink extended ack
7720 *
7721 * Adds a link to device which is upper to this one. The caller must hold
7722 * the RTNL lock. On a failure a negative errno code is returned.
7723 * On success the reference counts are adjusted and the function
7724 * returns zero.
7725 */
7726int netdev_upper_dev_link(struct net_device *dev,
7727                          struct net_device *upper_dev,
7728                          struct netlink_ext_ack *extack)
7729{
7730        return __netdev_upper_dev_link(dev, upper_dev, false,
7731                                       NULL, NULL, extack);
7732}
7733EXPORT_SYMBOL(netdev_upper_dev_link);
7734
7735/**
7736 * netdev_master_upper_dev_link - Add a master link to the upper device
7737 * @dev: device
7738 * @upper_dev: new upper device
7739 * @upper_priv: upper device private
7740 * @upper_info: upper info to be passed down via notifier
7741 * @extack: netlink extended ack
7742 *
7743 * Adds a link to device which is upper to this one. In this case, only
7744 * one master upper device can be linked, although other non-master devices
7745 * might be linked as well. The caller must hold the RTNL lock.
7746 * On a failure a negative errno code is returned. On success the reference
7747 * counts are adjusted and the function returns zero.
7748 */
7749int netdev_master_upper_dev_link(struct net_device *dev,
7750                                 struct net_device *upper_dev,
7751                                 void *upper_priv, void *upper_info,
7752                                 struct netlink_ext_ack *extack)
7753{
7754        return __netdev_upper_dev_link(dev, upper_dev, true,
7755                                       upper_priv, upper_info, extack);
7756}
7757EXPORT_SYMBOL(netdev_master_upper_dev_link);
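
/*
 * A minimal sketch of how a master-type driver (bonding/team-like) typically
 * enslaves a lower device; master_dev, slave_dev and err_unslave are
 * hypothetical, and real drivers usually pass driver-private
 * upper_priv/upper_info pointers instead of NULL:
 *
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev,
 *					   NULL, NULL, extack);
 *	if (err)
 *		goto err_unslave;
 */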
7758
7759/**
7760 * netdev_upper_dev_unlink - Removes a link to upper device
7761 * @dev: device
7762 * @upper_dev: new upper device
7763 *
7764 * Removes a link to device which is upper to this one. The caller must hold
7765 * the RTNL lock.
7766 */
7767void netdev_upper_dev_unlink(struct net_device *dev,
7768                             struct net_device *upper_dev)
7769{
7770        struct netdev_notifier_changeupper_info changeupper_info = {
7771                .info = {
7772                        .dev = dev,
7773                },
7774                .upper_dev = upper_dev,
7775                .linking = false,
7776        };
7777
7778        ASSERT_RTNL();
7779
7780        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7781
7782        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7783                                      &changeupper_info.info);
7784
7785        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7786
7787        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7788                                      &changeupper_info.info);
7789
7790        __netdev_update_upper_level(dev, NULL);
7791        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7792
7793        __netdev_update_lower_level(upper_dev, NULL);
7794        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7795                                    NULL);
7796}
7797EXPORT_SYMBOL(netdev_upper_dev_unlink);
7798
7799static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7800                                      struct net_device *lower_dev,
7801                                      bool val)
7802{
7803        struct netdev_adjacent *adj;
7804
7805        adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7806        if (adj)
7807                adj->ignore = val;
7808
7809        adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7810        if (adj)
7811                adj->ignore = val;
7812}
7813
7814static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7815                                        struct net_device *lower_dev)
7816{
7817        __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7818}
7819
7820static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7821                                       struct net_device *lower_dev)
7822{
7823        __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7824}
7825
7826int netdev_adjacent_change_prepare(struct net_device *old_dev,
7827                                   struct net_device *new_dev,
7828                                   struct net_device *dev,
7829                                   struct netlink_ext_ack *extack)
7830{
7831        int err;
7832
7833        if (!new_dev)
7834                return 0;
7835
7836        if (old_dev && new_dev != old_dev)
7837                netdev_adjacent_dev_disable(dev, old_dev);
7838
7839        err = netdev_upper_dev_link(new_dev, dev, extack);
7840        if (err) {
7841                if (old_dev && new_dev != old_dev)
7842                        netdev_adjacent_dev_enable(dev, old_dev);
7843                return err;
7844        }
7845
7846        return 0;
7847}
7848EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7849
7850void netdev_adjacent_change_commit(struct net_device *old_dev,
7851                                   struct net_device *new_dev,
7852                                   struct net_device *dev)
7853{
7854        if (!new_dev || !old_dev)
7855                return;
7856
7857        if (new_dev == old_dev)
7858                return;
7859
7860        netdev_adjacent_dev_enable(dev, old_dev);
7861        netdev_upper_dev_unlink(old_dev, dev);
7862}
7863EXPORT_SYMBOL(netdev_adjacent_change_commit);
7864
7865void netdev_adjacent_change_abort(struct net_device *old_dev,
7866                                  struct net_device *new_dev,
7867                                  struct net_device *dev)
7868{
7869        if (!new_dev)
7870                return;
7871
7872        if (old_dev && new_dev != old_dev)
7873                netdev_adjacent_dev_enable(dev, old_dev);
7874
7875        netdev_upper_dev_unlink(new_dev, dev);
7876}
7877EXPORT_SYMBOL(netdev_adjacent_change_abort);
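
    /*
     * Example (sketch): the prepare/commit/abort helpers above can be used
     * when @dev replaces its lower device @old_lower with @new_lower, for
     * instance while handling a changelink request.  foo_apply_new_config()
     * is hypothetical; everything runs under RTNL.
     *
     *      err = netdev_adjacent_change_prepare(old_lower, new_lower, dev,
     *                                           extack);
     *      if (err)
     *              return err;
     *
     *      err = foo_apply_new_config(dev, new_lower);
     *      if (err) {
     *              netdev_adjacent_change_abort(old_lower, new_lower, dev);
     *              return err;
     *      }
     *
     *      netdev_adjacent_change_commit(old_lower, new_lower, dev);
     */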
7878
7879/**
7880 * netdev_bonding_info_change - Dispatch event about slave change
7881 * @dev: device
7882 * @bonding_info: info to dispatch
7883 *
7884 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7885 * The caller must hold the RTNL lock.
7886 */
7887void netdev_bonding_info_change(struct net_device *dev,
7888                                struct netdev_bonding_info *bonding_info)
7889{
7890        struct netdev_notifier_bonding_info info = {
7891                .info.dev = dev,
7892        };
7893
7894        memcpy(&info.bonding_info, bonding_info,
7895               sizeof(struct netdev_bonding_info));
7896        call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7897                                      &info.info);
7898}
7899EXPORT_SYMBOL(netdev_bonding_info_change);
7900
7901/**
7902 * netdev_get_xmit_slave - Get the xmit slave of master device
     * @dev: master device
7903 * @skb: The packet
7904 * @all_slaves: assume all the slaves are active
7905 *
7906 * The reference counters are not incremented so the caller must be
7907 * careful with locks. The caller must hold the RCU read lock.
7908 * %NULL is returned if no slave is found.
7909 */
7910
7911struct net_device *netdev_get_xmit_slave(struct net_device *dev,
7912                                         struct sk_buff *skb,
7913                                         bool all_slaves)
7914{
7915        const struct net_device_ops *ops = dev->netdev_ops;
7916
7917        if (!ops->ndo_get_xmit_slave)
7918                return NULL;
7919        return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
7920}
7921EXPORT_SYMBOL(netdev_get_xmit_slave);
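
    /*
     * Example (sketch): resolving the egress slave for a packet on a
     * LAG-style master device; foo_use_slave() is hypothetical.  No
     * reference is taken, so the slave may only be used inside the RCU
     * read-side critical section.
     *
     *      rcu_read_lock();
     *      slave = netdev_get_xmit_slave(master, skb, false);
     *      if (slave)
     *              foo_use_slave(slave, skb);
     *      rcu_read_unlock();
     */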
7922
7923static void netdev_adjacent_add_links(struct net_device *dev)
7924{
7925        struct netdev_adjacent *iter;
7926
7927        struct net *net = dev_net(dev);
7928
7929        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7930                if (!net_eq(net, dev_net(iter->dev)))
7931                        continue;
7932                netdev_adjacent_sysfs_add(iter->dev, dev,
7933                                          &iter->dev->adj_list.lower);
7934                netdev_adjacent_sysfs_add(dev, iter->dev,
7935                                          &dev->adj_list.upper);
7936        }
7937
7938        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7939                if (!net_eq(net, dev_net(iter->dev)))
7940                        continue;
7941                netdev_adjacent_sysfs_add(iter->dev, dev,
7942                                          &iter->dev->adj_list.upper);
7943                netdev_adjacent_sysfs_add(dev, iter->dev,
7944                                          &dev->adj_list.lower);
7945        }
7946}
7947
7948static void netdev_adjacent_del_links(struct net_device *dev)
7949{
7950        struct netdev_adjacent *iter;
7951
7952        struct net *net = dev_net(dev);
7953
7954        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7955                if (!net_eq(net, dev_net(iter->dev)))
7956                        continue;
7957                netdev_adjacent_sysfs_del(iter->dev, dev->name,
7958                                          &iter->dev->adj_list.lower);
7959                netdev_adjacent_sysfs_del(dev, iter->dev->name,
7960                                          &dev->adj_list.upper);
7961        }
7962
7963        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7964                if (!net_eq(net, dev_net(iter->dev)))
7965                        continue;
7966                netdev_adjacent_sysfs_del(iter->dev, dev->name,
7967                                          &iter->dev->adj_list.upper);
7968                netdev_adjacent_sysfs_del(dev, iter->dev->name,
7969                                          &dev->adj_list.lower);
7970        }
7971}
7972
7973void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
7974{
7975        struct netdev_adjacent *iter;
7976
7977        struct net *net = dev_net(dev);
7978
7979        list_for_each_entry(iter, &dev->adj_list.upper, list) {
7980                if (!net_eq(net, dev_net(iter->dev)))
7981                        continue;
7982                netdev_adjacent_sysfs_del(iter->dev, oldname,
7983                                          &iter->dev->adj_list.lower);
7984                netdev_adjacent_sysfs_add(iter->dev, dev,
7985                                          &iter->dev->adj_list.lower);
7986        }
7987
7988        list_for_each_entry(iter, &dev->adj_list.lower, list) {
7989                if (!net_eq(net, dev_net(iter->dev)))
7990                        continue;
7991                netdev_adjacent_sysfs_del(iter->dev, oldname,
7992                                          &iter->dev->adj_list.upper);
7993                netdev_adjacent_sysfs_add(iter->dev, dev,
7994                                          &iter->dev->adj_list.upper);
7995        }
7996}
7997
7998void *netdev_lower_dev_get_private(struct net_device *dev,
7999                                   struct net_device *lower_dev)
8000{
8001        struct netdev_adjacent *lower;
8002
8003        if (!lower_dev)
8004                return NULL;
8005        lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8006        if (!lower)
8007                return NULL;
8008
8009        return lower->private;
8010}
8011EXPORT_SYMBOL(netdev_lower_dev_get_private);
8012
8013
8014/**
8015 * netdev_lower_state_changed - Dispatch event about lower device state change
8016 * @lower_dev: device
8017 * @lower_state_info: state to dispatch
8018 *
8019 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8020 * The caller must hold the RTNL lock.
8021 */
8022void netdev_lower_state_changed(struct net_device *lower_dev,
8023                                void *lower_state_info)
8024{
8025        struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8026                .info.dev = lower_dev,
8027        };
8028
8029        ASSERT_RTNL();
8030        changelowerstate_info.lower_state_info = lower_state_info;
8031        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8032                                      &changelowerstate_info.info);
8033}
8034EXPORT_SYMBOL(netdev_lower_state_changed);
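
    /*
     * Example (sketch): a master driver publishing new per-slave state to
     * interested listeners.  struct foo_lower_state is a stand-in for
     * whatever state structure the master and its listeners agree on.
     *
     *      struct foo_lower_state state = {
     *              .link_up    = true,
     *              .tx_enabled = true,
     *      };
     *
     *      ASSERT_RTNL();
     *      netdev_lower_state_changed(slave_dev, &state);
     */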
8035
8036static void dev_change_rx_flags(struct net_device *dev, int flags)
8037{
8038        const struct net_device_ops *ops = dev->netdev_ops;
8039
8040        if (ops->ndo_change_rx_flags)
8041                ops->ndo_change_rx_flags(dev, flags);
8042}
8043
8044static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8045{
8046        unsigned int old_flags = dev->flags;
8047        kuid_t uid;
8048        kgid_t gid;
8049
8050        ASSERT_RTNL();
8051
8052        dev->flags |= IFF_PROMISC;
8053        dev->promiscuity += inc;
8054        if (dev->promiscuity == 0) {
8055                /*
8056                 * Avoid overflow.
8057                 * If inc causes overflow, leave promisc untouched and return an error.
8058                 */
8059                if (inc < 0)
8060                        dev->flags &= ~IFF_PROMISC;
8061                else {
8062                        dev->promiscuity -= inc;
8063                        pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
8064                                dev->name);
8065                        return -EOVERFLOW;
8066                }
8067        }
8068        if (dev->flags != old_flags) {
8069                pr_info("device %s %s promiscuous mode\n",
8070                        dev->name,
8071                        dev->flags & IFF_PROMISC ? "entered" : "left");
8072                if (audit_enabled) {
8073                        current_uid_gid(&uid, &gid);
8074                        audit_log(audit_context(), GFP_ATOMIC,
8075                                  AUDIT_ANOM_PROMISCUOUS,
8076                                  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8077                                  dev->name, (dev->flags & IFF_PROMISC),
8078                                  (old_flags & IFF_PROMISC),
8079                                  from_kuid(&init_user_ns, audit_get_loginuid(current)),
8080                                  from_kuid(&init_user_ns, uid),
8081                                  from_kgid(&init_user_ns, gid),
8082                                  audit_get_sessionid(current));
8083                }
8084
8085                dev_change_rx_flags(dev, IFF_PROMISC);
8086        }
8087        if (notify)
8088                __dev_notify_flags(dev, old_flags, IFF_PROMISC);
8089        return 0;
8090}
8091
8092/**
8093 *      dev_set_promiscuity     - update promiscuity count on a device
8094 *      @dev: device
8095 *      @inc: modifier
8096 *
8097 *      Add or remove promiscuity from a device. While the count in the device
8098 *      remains above zero the interface remains promiscuous. Once it hits zero
8099 *      the device reverts to normal filtering operation. A negative inc
8100 *      value is used to drop promiscuity on the device.
8101 *      Return 0 if successful or a negative errno code on error.
8102 */
8103int dev_set_promiscuity(struct net_device *dev, int inc)
8104{
8105        unsigned int old_flags = dev->flags;
8106        int err;
8107
8108        err = __dev_set_promiscuity(dev, inc, true);
8109        if (err < 0)
8110                return err;
8111        if (dev->flags != old_flags)
8112                dev_set_rx_mode(dev);
8113        return err;
8114}
8115EXPORT_SYMBOL(dev_set_promiscuity);
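
    /*
     * Example (sketch): a hypothetical capture facility taking one
     * promiscuity reference and dropping it again later; every increment
     * must be balanced by a matching decrement, and RTNL must be held.
     *
     *      rtnl_lock();
     *      err = dev_set_promiscuity(dev, 1);
     *      rtnl_unlock();
     *      ...
     *      rtnl_lock();
     *      dev_set_promiscuity(dev, -1);
     *      rtnl_unlock();
     */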
8116
8117static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8118{
8119        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8120
8121        ASSERT_RTNL();
8122
8123        dev->flags |= IFF_ALLMULTI;
8124        dev->allmulti += inc;
8125        if (dev->allmulti == 0) {
8126                /*
8127                 * Avoid overflow.
8128                 * If inc causes overflow, leave allmulti untouched and return an error.
8129                 */
8130                if (inc < 0)
8131                        dev->flags &= ~IFF_ALLMULTI;
8132                else {
8133                        dev->allmulti -= inc;
8134                        pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
8135                                dev->name);
8136                        return -EOVERFLOW;
8137                }
8138        }
8139        if (dev->flags ^ old_flags) {
8140                dev_change_rx_flags(dev, IFF_ALLMULTI);
8141                dev_set_rx_mode(dev);
8142                if (notify)
8143                        __dev_notify_flags(dev, old_flags,
8144                                           dev->gflags ^ old_gflags);
8145        }
8146        return 0;
8147}
8148
8149/**
8150 *      dev_set_allmulti        - update allmulti count on a device
8151 *      @dev: device
8152 *      @inc: modifier
8153 *
8154 *      Add or remove reception of all multicast frames on a device. While the
8155 *      count in the device remains above zero the interface keeps listening
8156 *      to all multicast frames. Once it hits zero the device reverts to normal
8157 *      filtering operation. A negative @inc value is used to drop the counter
8158 *      when releasing a resource needing all multicasts.
8159 *      Return 0 if successful or a negative errno code on error.
8160 */
8161
8162int dev_set_allmulti(struct net_device *dev, int inc)
8163{
8164        return __dev_set_allmulti(dev, inc, true);
8165}
8166EXPORT_SYMBOL(dev_set_allmulti);
8167
8168/*
8169 *      Upload unicast and multicast address lists to device and
8170 *      configure RX filtering. When the device doesn't support unicast
8171 *      filtering it is put in promiscuous mode while unicast addresses
8172 *      are present.
8173 */
8174void __dev_set_rx_mode(struct net_device *dev)
8175{
8176        const struct net_device_ops *ops = dev->netdev_ops;
8177
8178        /* dev_open will call this function so the list will stay sane. */
8179        if (!(dev->flags&IFF_UP))
8180                return;
8181
8182        if (!netif_device_present(dev))
8183                return;
8184
8185        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8186                /* Unicast address changes may only happen under the rtnl,
8187                 * therefore calling __dev_set_promiscuity here is safe.
8188                 */
8189                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8190                        __dev_set_promiscuity(dev, 1, false);
8191                        dev->uc_promisc = true;
8192                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8193                        __dev_set_promiscuity(dev, -1, false);
8194                        dev->uc_promisc = false;
8195                }
8196        }
8197
8198        if (ops->ndo_set_rx_mode)
8199                ops->ndo_set_rx_mode(dev);
8200}
8201
8202void dev_set_rx_mode(struct net_device *dev)
8203{
8204        netif_addr_lock_bh(dev);
8205        __dev_set_rx_mode(dev);
8206        netif_addr_unlock_bh(dev);
8207}
8208
8209/**
8210 *      dev_get_flags - get flags reported to userspace
8211 *      @dev: device
8212 *
8213 *      Get the combination of flag bits exported through APIs to userspace.
8214 */
8215unsigned int dev_get_flags(const struct net_device *dev)
8216{
8217        unsigned int flags;
8218
8219        flags = (dev->flags & ~(IFF_PROMISC |
8220                                IFF_ALLMULTI |
8221                                IFF_RUNNING |
8222                                IFF_LOWER_UP |
8223                                IFF_DORMANT)) |
8224                (dev->gflags & (IFF_PROMISC |
8225                                IFF_ALLMULTI));
8226
8227        if (netif_running(dev)) {
8228                if (netif_oper_up(dev))
8229                        flags |= IFF_RUNNING;
8230                if (netif_carrier_ok(dev))
8231                        flags |= IFF_LOWER_UP;
8232                if (netif_dormant(dev))
8233                        flags |= IFF_DORMANT;
8234        }
8235
8236        return flags;
8237}
8238EXPORT_SYMBOL(dev_get_flags);
8239
8240int __dev_change_flags(struct net_device *dev, unsigned int flags,
8241                       struct netlink_ext_ack *extack)
8242{
8243        unsigned int old_flags = dev->flags;
8244        int ret;
8245
8246        ASSERT_RTNL();
8247
8248        /*
8249         *      Set the flags on our device.
8250         */
8251
8252        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8253                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8254                               IFF_AUTOMEDIA)) |
8255                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8256                                    IFF_ALLMULTI));
8257
8258        /*
8259         *      Load in the correct multicast list now that the flags have changed.
8260         */
8261
8262        if ((old_flags ^ flags) & IFF_MULTICAST)
8263                dev_change_rx_flags(dev, IFF_MULTICAST);
8264
8265        dev_set_rx_mode(dev);
8266
8267        /*
8268         *      Have we downed the interface? We handle IFF_UP ourselves
8269         *      according to user attempts to set it, rather than blindly
8270         *      setting it.
8271         */
8272
8273        ret = 0;
8274        if ((old_flags ^ flags) & IFF_UP) {
8275                if (old_flags & IFF_UP)
8276                        __dev_close(dev);
8277                else
8278                        ret = __dev_open(dev, extack);
8279        }
8280
8281        if ((flags ^ dev->gflags) & IFF_PROMISC) {
8282                int inc = (flags & IFF_PROMISC) ? 1 : -1;
8283                unsigned int old_flags = dev->flags;
8284
8285                dev->gflags ^= IFF_PROMISC;
8286
8287                if (__dev_set_promiscuity(dev, inc, false) >= 0)
8288                        if (dev->flags != old_flags)
8289                                dev_set_rx_mode(dev);
8290        }
8291
8292        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8293         * is important. Some (broken) drivers set IFF_PROMISC when
8294         * IFF_ALLMULTI is requested, without asking us and without reporting it.
8295         */
8296        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8297                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8298
8299                dev->gflags ^= IFF_ALLMULTI;
8300                __dev_set_allmulti(dev, inc, false);
8301        }
8302
8303        return ret;
8304}
8305
8306void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8307                        unsigned int gchanges)
8308{
8309        unsigned int changes = dev->flags ^ old_flags;
8310
8311        if (gchanges)
8312                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
8313
8314        if (changes & IFF_UP) {
8315                if (dev->flags & IFF_UP)
8316                        call_netdevice_notifiers(NETDEV_UP, dev);
8317                else
8318                        call_netdevice_notifiers(NETDEV_DOWN, dev);
8319        }
8320
8321        if (dev->flags & IFF_UP &&
8322            (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8323                struct netdev_notifier_change_info change_info = {
8324                        .info = {
8325                                .dev = dev,
8326                        },
8327                        .flags_changed = changes,
8328                };
8329
8330                call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8331        }
8332}
8333
8334/**
8335 *      dev_change_flags - change device settings
8336 *      @dev: device
8337 *      @flags: device state flags
8338 *      @extack: netlink extended ack
8339 *
8340 *      Change settings on the device based on state flags. The flags are
8341 *      in the userspace exported format.
8342 */
8343int dev_change_flags(struct net_device *dev, unsigned int flags,
8344                     struct netlink_ext_ack *extack)
8345{
8346        int ret;
8347        unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8348
8349        ret = __dev_change_flags(dev, flags, extack);
8350        if (ret < 0)
8351                return ret;
8352
8353        changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8354        __dev_notify_flags(dev, old_flags, changes);
8355        return ret;
8356}
8357EXPORT_SYMBOL(dev_change_flags);
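
    /*
     * Example (sketch): bringing an interface administratively up the way
     * an ioctl or netlink request would, by read-modify-write of the
     * userspace-visible flags.  Error handling is elided; extack may be
     * NULL when there is no netlink request to attach messages to.
     *
     *      rtnl_lock();
     *      err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP, NULL);
     *      rtnl_unlock();
     */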
8358
8359int __dev_set_mtu(struct net_device *dev, int new_mtu)
8360{
8361        const struct net_device_ops *ops = dev->netdev_ops;
8362
8363        if (ops->ndo_change_mtu)
8364                return ops->ndo_change_mtu(dev, new_mtu);
8365
8366        /* Pairs with all the lockless reads of dev->mtu in the stack */
8367        WRITE_ONCE(dev->mtu, new_mtu);
8368        return 0;
8369}
8370EXPORT_SYMBOL(__dev_set_mtu);
8371
8372int dev_validate_mtu(struct net_device *dev, int new_mtu,
8373                     struct netlink_ext_ack *extack)
8374{
8375        /* MTU must be non-negative, and in range */
8376        if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8377                NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8378                return -EINVAL;
8379        }
8380
8381        if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8382                NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8383                return -EINVAL;
8384        }
8385        return 0;
8386}
8387
8388/**
8389 *      dev_set_mtu_ext - Change maximum transfer unit
8390 *      @dev: device
8391 *      @new_mtu: new transfer unit
8392 *      @extack: netlink extended ack
8393 *
8394 *      Change the maximum transfer size of the network device.
8395 */
8396int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8397                    struct netlink_ext_ack *extack)
8398{
8399        int err, orig_mtu;
8400
8401        if (new_mtu == dev->mtu)
8402                return 0;
8403
8404        err = dev_validate_mtu(dev, new_mtu, extack);
8405        if (err)
8406                return err;
8407
8408        if (!netif_device_present(dev))
8409                return -ENODEV;
8410
8411        err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8412        err = notifier_to_errno(err);
8413        if (err)
8414                return err;
8415
8416        orig_mtu = dev->mtu;
8417        err = __dev_set_mtu(dev, new_mtu);
8418
8419        if (!err) {
8420                err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8421                                                   orig_mtu);
8422                err = notifier_to_errno(err);
8423                if (err) {
8424                        /* setting mtu back and notifying everyone again,
8425                         * so that they have a chance to revert changes.
8426                         */
8427                        __dev_set_mtu(dev, orig_mtu);
8428                        call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8429                                                     new_mtu);
8430                }
8431        }
8432        return err;
8433}
8434
8435int dev_set_mtu(struct net_device *dev, int new_mtu)
8436{
8437        struct netlink_ext_ack extack;
8438        int err;
8439
8440        memset(&extack, 0, sizeof(extack));
8441        err = dev_set_mtu_ext(dev, new_mtu, &extack);
8442        if (err && extack._msg)
8443                net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8444        return err;
8445}
8446EXPORT_SYMBOL(dev_set_mtu);
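
    /*
     * Example (sketch): a tunnel-like driver shrinking its own MTU after
     * the lower device's MTU changed.  FOO_HDR_LEN and the two devices are
     * hypothetical; RTNL must be held.
     *
     *      rtnl_lock();
     *      err = dev_set_mtu(tunnel_dev, lower_dev->mtu - FOO_HDR_LEN);
     *      rtnl_unlock();
     */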
8447
8448/**
8449 *      dev_change_tx_queue_len - Change TX queue length of a netdevice
8450 *      @dev: device
8451 *      @new_len: new tx queue length
8452 */
8453int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8454{
8455        unsigned int orig_len = dev->tx_queue_len;
8456        int res;
8457
8458        if (new_len != (unsigned int)new_len)
8459                return -ERANGE;
8460
8461        if (new_len != orig_len) {
8462                dev->tx_queue_len = new_len;
8463                res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8464                res = notifier_to_errno(res);
8465                if (res)
8466                        goto err_rollback;
8467                res = dev_qdisc_change_tx_queue_len(dev);
8468                if (res)
8469                        goto err_rollback;
8470        }
8471
8472        return 0;
8473
8474err_rollback:
8475        netdev_err(dev, "refused to change device tx_queue_len\n");
8476        dev->tx_queue_len = orig_len;
8477        return res;
8478}
8479
8480/**
8481 *      dev_set_group - Change group this device belongs to
8482 *      @dev: device
8483 *      @new_group: group this device should belong to
8484 */
8485void dev_set_group(struct net_device *dev, int new_group)
8486{
8487        dev->group = new_group;
8488}
8489EXPORT_SYMBOL(dev_set_group);
8490
8491/**
8492 *      dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8493 *      @dev: device
8494 *      @addr: new address
8495 *      @extack: netlink extended ack
8496 */
8497int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8498                              struct netlink_ext_ack *extack)
8499{
8500        struct netdev_notifier_pre_changeaddr_info info = {
8501                .info.dev = dev,
8502                .info.extack = extack,
8503                .dev_addr = addr,
8504        };
8505        int rc;
8506
8507        rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8508        return notifier_to_errno(rc);
8509}
8510EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8511
8512/**
8513 *      dev_set_mac_address - Change Media Access Control Address
8514 *      @dev: device
8515 *      @sa: new address
8516 *      @extack: netlink extended ack
8517 *
8518 *      Change the hardware (MAC) address of the device
8519 */
8520int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8521                        struct netlink_ext_ack *extack)
8522{
8523        const struct net_device_ops *ops = dev->netdev_ops;
8524        int err;
8525
8526        if (!ops->ndo_set_mac_address)
8527                return -EOPNOTSUPP;
8528        if (sa->sa_family != dev->type)
8529                return -EINVAL;
8530        if (!netif_device_present(dev))
8531                return -ENODEV;
8532        err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8533        if (err)
8534                return err;
8535        err = ops->ndo_set_mac_address(dev, sa);
8536        if (err)
8537                return err;
8538        dev->addr_assign_type = NET_ADDR_SET;
8539        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8540        add_device_randomness(dev->dev_addr, dev->addr_len);
8541        return 0;
8542}
8543EXPORT_SYMBOL(dev_set_mac_address);
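
    /*
     * Example (sketch): setting a new hardware address from a
     * driver-private buffer new_addr[] on an Ethernet-like device whose
     * dev->addr_len fits in sa_data; sa_family must match dev->type and
     * RTNL must be held.
     *
     *      struct sockaddr sa;
     *
     *      sa.sa_family = dev->type;
     *      memcpy(sa.sa_data, new_addr, dev->addr_len);
     *      err = dev_set_mac_address(dev, &sa, NULL);
     */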
8544
8545/**
8546 *      dev_change_carrier - Change device carrier
8547 *      @dev: device
8548 *      @new_carrier: new value
8549 *
8550 *      Change device carrier
8551 */
8552int dev_change_carrier(struct net_device *dev, bool new_carrier)
8553{
8554        const struct net_device_ops *ops = dev->netdev_ops;
8555
8556        if (!ops->ndo_change_carrier)
8557                return -EOPNOTSUPP;
8558        if (!netif_device_present(dev))
8559                return -ENODEV;
8560        return ops->ndo_change_carrier(dev, new_carrier);
8561}
8562EXPORT_SYMBOL(dev_change_carrier);
8563
8564/**
8565 *      dev_get_phys_port_id - Get device physical port ID
8566 *      @dev: device
8567 *      @ppid: port ID
8568 *
8569 *      Get device physical port ID
8570 */
8571int dev_get_phys_port_id(struct net_device *dev,
8572                         struct netdev_phys_item_id *ppid)
8573{
8574        const struct net_device_ops *ops = dev->netdev_ops;
8575
8576        if (!ops->ndo_get_phys_port_id)
8577                return -EOPNOTSUPP;
8578        return ops->ndo_get_phys_port_id(dev, ppid);
8579}
8580EXPORT_SYMBOL(dev_get_phys_port_id);
8581
8582/**
8583 *      dev_get_phys_port_name - Get device physical port name
8584 *      @dev: device
8585 *      @name: port name
8586 *      @len: limit of bytes to copy to name
8587 *
8588 *      Get device physical port name
8589 */
8590int dev_get_phys_port_name(struct net_device *dev,
8591                           char *name, size_t len)
8592{
8593        const struct net_device_ops *ops = dev->netdev_ops;
8594        int err;
8595
8596        if (ops->ndo_get_phys_port_name) {
8597                err = ops->ndo_get_phys_port_name(dev, name, len);
8598                if (err != -EOPNOTSUPP)
8599                        return err;
8600        }
8601        return devlink_compat_phys_port_name_get(dev, name, len);
8602}
8603EXPORT_SYMBOL(dev_get_phys_port_name);
8604
8605/**
8606 *      dev_get_port_parent_id - Get the device's port parent identifier
8607 *      @dev: network device
8608 *      @ppid: pointer to a storage for the port's parent identifier
8609 *      @recurse: allow/disallow recursion to lower devices
8610 *
8611 *      Get the device's port parent identifier
8612 */
8613int dev_get_port_parent_id(struct net_device *dev,
8614                           struct netdev_phys_item_id *ppid,
8615                           bool recurse)
8616{
8617        const struct net_device_ops *ops = dev->netdev_ops;
8618        struct netdev_phys_item_id first = { };
8619        struct net_device *lower_dev;
8620        struct list_head *iter;
8621        int err;
8622
8623        if (ops->ndo_get_port_parent_id) {
8624                err = ops->ndo_get_port_parent_id(dev, ppid);
8625                if (err != -EOPNOTSUPP)
8626                        return err;
8627        }
8628
8629        err = devlink_compat_switch_id_get(dev, ppid);
8630        if (!err || err != -EOPNOTSUPP)
8631                return err;
8632
8633        if (!recurse)
8634                return -EOPNOTSUPP;
8635
8636        netdev_for_each_lower_dev(dev, lower_dev, iter) {
8637                err = dev_get_port_parent_id(lower_dev, ppid, recurse);
8638                if (err)
8639                        break;
8640                if (!first.id_len)
8641                        first = *ppid;
8642                else if (memcmp(&first, ppid, sizeof(*ppid)))
8643                        return -ENODATA;
8644        }
8645
8646        return err;
8647}
8648EXPORT_SYMBOL(dev_get_port_parent_id);
8649
8650/**
8651 *      netdev_port_same_parent_id - Indicate if two network devices have
8652 *      the same port parent identifier
8653 *      @a: first network device
8654 *      @b: second network device
8655 */
8656bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8657{
8658        struct netdev_phys_item_id a_id = { };
8659        struct netdev_phys_item_id b_id = { };
8660
8661        if (dev_get_port_parent_id(a, &a_id, true) ||
8662            dev_get_port_parent_id(b, &b_id, true))
8663                return false;
8664
8665        return netdev_phys_item_id_same(&a_id, &b_id);
8666}
8667EXPORT_SYMBOL(netdev_port_same_parent_id);
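
    /*
     * Example (sketch): refusing a hardware-offloaded configuration that
     * would span two ports which do not sit behind the same switch ASIC.
     *
     *      if (!netdev_port_same_parent_id(dev_a, dev_b)) {
     *              NL_SET_ERR_MSG(extack,
     *                             "Devices are not on the same physical switch");
     *              return -EOPNOTSUPP;
     *      }
     */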
8668
8669/**
8670 *      dev_change_proto_down - update protocol port state information
8671 *      @dev: device
8672 *      @proto_down: new value
8673 *
8674 *      This info can be used by switch drivers to set the phys state of the
8675 *      port.
8676 */
8677int dev_change_proto_down(struct net_device *dev, bool proto_down)
8678{
8679        const struct net_device_ops *ops = dev->netdev_ops;
8680
8681        if (!ops->ndo_change_proto_down)
8682                return -EOPNOTSUPP;
8683        if (!netif_device_present(dev))
8684                return -ENODEV;
8685        return ops->ndo_change_proto_down(dev, proto_down);
8686}
8687EXPORT_SYMBOL(dev_change_proto_down);
8688
8689/**
8690 *      dev_change_proto_down_generic - generic implementation for
8691 *      ndo_change_proto_down that sets carrier according to
8692 *      proto_down.
8693 *
8694 *      @dev: device
8695 *      @proto_down: new value
8696 */
8697int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
8698{
8699        if (proto_down)
8700                netif_carrier_off(dev);
8701        else
8702                netif_carrier_on(dev);
8703        dev->proto_down = proto_down;
8704        return 0;
8705}
8706EXPORT_SYMBOL(dev_change_proto_down_generic);
8707
8708u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
8709                    enum bpf_netdev_command cmd)
8710{
8711        struct netdev_bpf xdp;
8712
8713        if (!bpf_op)
8714                return 0;
8715
8716        memset(&xdp, 0, sizeof(xdp));
8717        xdp.command = cmd;
8718
8719        /* Query must always succeed. */
8720        WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
8721
8722        return xdp.prog_id;
8723}
8724
8725static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
8726                           struct netlink_ext_ack *extack, u32 flags,
8727                           struct bpf_prog *prog)
8728{
8729        bool non_hw = !(flags & XDP_FLAGS_HW_MODE);
8730        struct bpf_prog *prev_prog = NULL;
8731        struct netdev_bpf xdp;
8732        int err;
8733
8734        if (non_hw) {
8735                prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op,
8736                                                           XDP_QUERY_PROG));
8737                if (IS_ERR(prev_prog))
8738                        prev_prog = NULL;
8739        }
8740
8741        memset(&xdp, 0, sizeof(xdp));
8742        if (flags & XDP_FLAGS_HW_MODE)
8743                xdp.command = XDP_SETUP_PROG_HW;
8744        else
8745                xdp.command = XDP_SETUP_PROG;
8746        xdp.extack = extack;
8747        xdp.flags = flags;
8748        xdp.prog = prog;
8749
8750        err = bpf_op(dev, &xdp);
8751        if (!err && non_hw)
8752                bpf_prog_change_xdp(prev_prog, prog);
8753
8754        if (prev_prog)
8755                bpf_prog_put(prev_prog);
8756
8757        return err;
8758}
8759
8760static void dev_xdp_uninstall(struct net_device *dev)
8761{
8762        struct netdev_bpf xdp;
8763        bpf_op_t ndo_bpf;
8764
8765        /* Remove generic XDP */
8766        WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
8767
8768        /* Remove from the driver */
8769        ndo_bpf = dev->netdev_ops->ndo_bpf;
8770        if (!ndo_bpf)
8771                return;
8772
8773        memset(&xdp, 0, sizeof(xdp));
8774        xdp.command = XDP_QUERY_PROG;
8775        WARN_ON(ndo_bpf(dev, &xdp));
8776        if (xdp.prog_id)
8777                WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8778                                        NULL));
8779
8780        /* Remove HW offload */
8781        memset(&xdp, 0, sizeof(xdp));
8782        xdp.command = XDP_QUERY_PROG_HW;
8783        if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
8784                WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
8785                                        NULL));
8786}
8787
8788/**
8789 *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
8790 *      @dev: device
8791 *      @extack: netlink extended ack
8792 *      @fd: new program fd or negative value to clear
8793 *      @expected_fd: old program fd that userspace expects to replace or clear
8794 *      @flags: xdp-related flags
8795 *
8796 *      Set or clear a bpf program for a device
8797 */
8798int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
8799                      int fd, int expected_fd, u32 flags)
8800{
8801        const struct net_device_ops *ops = dev->netdev_ops;
8802        enum bpf_netdev_command query;
8803        u32 prog_id, expected_id = 0;
8804        bpf_op_t bpf_op, bpf_chk;
8805        struct bpf_prog *prog;
8806        bool offload;
8807        int err;
8808
8809        ASSERT_RTNL();
8810
8811        offload = flags & XDP_FLAGS_HW_MODE;
8812        query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
8813
8814        bpf_op = bpf_chk = ops->ndo_bpf;
8815        if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) {
8816                NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");
8817                return -EOPNOTSUPP;
8818        }
8819        if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
8820                bpf_op = generic_xdp_install;
8821        if (bpf_op == bpf_chk)
8822                bpf_chk = generic_xdp_install;
8823
8824        prog_id = __dev_xdp_query(dev, bpf_op, query);
8825        if (flags & XDP_FLAGS_REPLACE) {
8826                if (expected_fd >= 0) {
8827                        prog = bpf_prog_get_type_dev(expected_fd,
8828                                                     BPF_PROG_TYPE_XDP,
8829                                                     bpf_op == ops->ndo_bpf);
8830                        if (IS_ERR(prog))
8831                                return PTR_ERR(prog);
8832                        expected_id = prog->aux->id;
8833                        bpf_prog_put(prog);
8834                }
8835
8836                if (prog_id != expected_id) {
8837                        NL_SET_ERR_MSG(extack, "Active program does not match expected");
8838                        return -EEXIST;
8839                }
8840        }
8841        if (fd >= 0) {
8842                if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
8843                        NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
8844                        return -EEXIST;
8845                }
8846
8847                if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
8848                        NL_SET_ERR_MSG(extack, "XDP program already attached");
8849                        return -EBUSY;
8850                }
8851
8852                prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
8853                                             bpf_op == ops->ndo_bpf);
8854                if (IS_ERR(prog))
8855                        return PTR_ERR(prog);
8856
8857                if (!offload && bpf_prog_is_dev_bound(prog->aux)) {
8858                        NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
8859                        bpf_prog_put(prog);
8860                        return -EINVAL;
8861                }
8862
8863                if (prog->expected_attach_type == BPF_XDP_DEVMAP) {
8864                        NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
8865                        bpf_prog_put(prog);
8866                        return -EINVAL;
8867                }
8868
8869                /* prog->aux->id may be 0 for orphaned device-bound progs */
8870                if (prog->aux->id && prog->aux->id == prog_id) {
8871                        bpf_prog_put(prog);
8872                        return 0;
8873                }
8874        } else {
8875                if (!prog_id)
8876                        return 0;
8877                prog = NULL;
8878        }
8879
8880        err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
8881        if (err < 0 && prog)
8882                bpf_prog_put(prog);
8883
8884        return err;
8885}
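
    /*
     * Example (sketch): how an rtnetlink-style caller might drive the
     * helper above, under RTNL.  First attach a program in generic (skb)
     * mode, later detach it; prog_fd comes from userspace and expected_fd
     * is ignored because XDP_FLAGS_REPLACE is not set.
     *
     *      err = dev_change_xdp_fd(dev, extack, prog_fd, -1,
     *                              XDP_FLAGS_SKB_MODE);
     *      ...
     *      err = dev_change_xdp_fd(dev, extack, -1, -1, XDP_FLAGS_SKB_MODE);
     */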
8886
8887/**
8888 *      dev_new_index   -       allocate an ifindex
8889 *      @net: the applicable net namespace
8890 *
8891 *      Returns a suitable unique value for a new device interface
8892 *      number.  The caller must hold the rtnl semaphore or the
8893 *      dev_base_lock to be sure it remains unique.
8894 */
8895static int dev_new_index(struct net *net)
8896{
8897        int ifindex = net->ifindex;
8898
8899        for (;;) {
8900                if (++ifindex <= 0)
8901                        ifindex = 1;
8902                if (!__dev_get_by_index(net, ifindex))
8903                        return net->ifindex = ifindex;
8904        }
8905}
8906
8907/* Delayed registration/unregisteration */
8908static LIST_HEAD(net_todo_list);
8909DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
8910
8911static void net_set_todo(struct net_device *dev)
8912{
8913        list_add_tail(&dev->todo_list, &net_todo_list);
8914        dev_net(dev)->dev_unreg_count++;
8915}
8916
8917static void rollback_registered_many(struct list_head *head)
8918{
8919        struct net_device *dev, *tmp;
8920        LIST_HEAD(close_head);
8921
8922        BUG_ON(dev_boot_phase);
8923        ASSERT_RTNL();
8924
8925        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
8926                /* Some devices get here without ever having been
8927                 * registered, as part of initialization unwind.
8928                 * Remove those devices and proceed with the remaining.
8929                 */
8930                if (dev->reg_state == NETREG_UNINITIALIZED) {
8931                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
8932                                 dev->name, dev);
8933
8934                        WARN_ON(1);
8935                        list_del(&dev->unreg_list);
8936                        continue;
8937                }
8938                dev->dismantle = true;
8939                BUG_ON(dev->reg_state != NETREG_REGISTERED);
8940        }
8941
8942        /* If device is running, close it first. */
8943        list_for_each_entry(dev, head, unreg_list)
8944                list_add_tail(&dev->close_list, &close_head);
8945        dev_close_many(&close_head, true);
8946
8947        list_for_each_entry(dev, head, unreg_list) {
8948                /* And unlink it from device chain. */
8949                unlist_netdevice(dev);
8950
8951                dev->reg_state = NETREG_UNREGISTERING;
8952        }
8953        flush_all_backlogs();
8954
8955        synchronize_net();
8956
8957        list_for_each_entry(dev, head, unreg_list) {
8958                struct sk_buff *skb = NULL;
8959
8960                /* Shutdown queueing discipline. */
8961                dev_shutdown(dev);
8962
8963                dev_xdp_uninstall(dev);
8964
8965                /* Notify protocols, that we are about to destroy
8966                 * this device. They should clean all the things.
8967                 */
8968                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8969
8970                if (!dev->rtnl_link_ops ||
8971                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8972                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
8973                                                     GFP_KERNEL, NULL, 0);
8974
8975                /*
8976                 *      Flush the unicast and multicast chains
8977                 */
8978                dev_uc_flush(dev);
8979                dev_mc_flush(dev);
8980
8981                netdev_name_node_alt_flush(dev);
8982                netdev_name_node_free(dev->name_node);
8983
8984                if (dev->netdev_ops->ndo_uninit)
8985                        dev->netdev_ops->ndo_uninit(dev);
8986
8987                if (skb)
8988                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
8989
8990                /* Notifier chain MUST detach us from all upper devices. */
8991                WARN_ON(netdev_has_any_upper_dev(dev));
8992                WARN_ON(netdev_has_any_lower_dev(dev));
8993
8994                /* Remove entries from kobject tree */
8995                netdev_unregister_kobject(dev);
8996#ifdef CONFIG_XPS
8997                /* Remove XPS queueing entries */
8998                netif_reset_xps_queues_gt(dev, 0);
8999#endif
9000        }
9001
9002        synchronize_net();
9003
9004        list_for_each_entry(dev, head, unreg_list)
9005                dev_put(dev);
9006}
9007
9008static void rollback_registered(struct net_device *dev)
9009{
9010        LIST_HEAD(single);
9011
9012        list_add(&dev->unreg_list, &single);
9013        rollback_registered_many(&single);
9014        list_del(&single);
9015}
9016
9017static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9018        struct net_device *upper, netdev_features_t features)
9019{
9020        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9021        netdev_features_t feature;
9022        int feature_bit;
9023
9024        for_each_netdev_feature(upper_disables, feature_bit) {
9025                feature = __NETIF_F_BIT(feature_bit);
9026                if (!(upper->wanted_features & feature)
9027                    && (features & feature)) {
9028                        netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9029                                   &feature, upper->name);
9030                        features &= ~feature;
9031                }
9032        }
9033
9034        return features;
9035}
9036
9037static void netdev_sync_lower_features(struct net_device *upper,
9038        struct net_device *lower, netdev_features_t features)
9039{
9040        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9041        netdev_features_t feature;
9042        int feature_bit;
9043
9044        for_each_netdev_feature(upper_disables, feature_bit) {
9045                feature = __NETIF_F_BIT(feature_bit);
9046                if (!(features & feature) && (lower->features & feature)) {
9047                        netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9048                                   &feature, lower->name);
9049                        lower->wanted_features &= ~feature;
9050                        __netdev_update_features(lower);
9051
9052                        if (unlikely(lower->features & feature))
9053                                netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9054                                            &feature, lower->name);
9055                        else
9056                                netdev_features_change(lower);
9057                }
9058        }
9059}
9060
9061static netdev_features_t netdev_fix_features(struct net_device *dev,
9062        netdev_features_t features)
9063{
9064        /* Fix illegal checksum combinations */
9065        if ((features & NETIF_F_HW_CSUM) &&
9066            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9067                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9068                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9069        }
9070
9071        /* TSO requires that SG is present as well. */
9072        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9073                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9074                features &= ~NETIF_F_ALL_TSO;
9075        }
9076
9077        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9078                                        !(features & NETIF_F_IP_CSUM)) {
9079                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9080                features &= ~NETIF_F_TSO;
9081                features &= ~NETIF_F_TSO_ECN;
9082        }
9083
9084        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9085                                         !(features & NETIF_F_IPV6_CSUM)) {
9086                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9087                features &= ~NETIF_F_TSO6;
9088        }
9089
9090        /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9091        if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9092                features &= ~NETIF_F_TSO_MANGLEID;
9093
9094        /* TSO ECN requires that TSO is present as well. */
9095        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9096                features &= ~NETIF_F_TSO_ECN;
9097
9098        /* Software GSO depends on SG. */
9099        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9100                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9101                features &= ~NETIF_F_GSO;
9102        }
9103
9104        /* GSO partial features require GSO partial be set */
9105        if ((features & dev->gso_partial_features) &&
9106            !(features & NETIF_F_GSO_PARTIAL)) {
9107                netdev_dbg(dev,
9108                           "Dropping partially supported GSO features since no GSO partial.\n");
9109                features &= ~dev->gso_partial_features;
9110        }
9111
9112        if (!(features & NETIF_F_RXCSUM)) {
9113                /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9114                 * successfully merged by hardware must also have the
9115                 * checksum verified by hardware.  If the user does not
9116                 * want to enable RXCSUM, logically, we should disable GRO_HW.
9117                 */
9118                if (features & NETIF_F_GRO_HW) {
9119                        netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9120                        features &= ~NETIF_F_GRO_HW;
9121                }
9122        }
9123
9124        /* LRO/HW-GRO features cannot be combined with RX-FCS */
9125        if (features & NETIF_F_RXFCS) {
9126                if (features & NETIF_F_LRO) {
9127                        netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9128                        features &= ~NETIF_F_LRO;
9129                }
9130
9131                if (features & NETIF_F_GRO_HW) {
9132                        netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9133                        features &= ~NETIF_F_GRO_HW;
9134                }
9135        }
9136
9137        return features;
9138}
9139
9140int __netdev_update_features(struct net_device *dev)
9141{
9142        struct net_device *upper, *lower;
9143        netdev_features_t features;
9144        struct list_head *iter;
9145        int err = -1;
9146
9147        ASSERT_RTNL();
9148
9149        features = netdev_get_wanted_features(dev);
9150
9151        if (dev->netdev_ops->ndo_fix_features)
9152                features = dev->netdev_ops->ndo_fix_features(dev, features);
9153
9154        /* driver might be less strict about feature dependencies */
9155        features = netdev_fix_features(dev, features);
9156
9157        /* some features can't be enabled if they're off on an upper device */
9158        netdev_for_each_upper_dev_rcu(dev, upper, iter)
9159                features = netdev_sync_upper_features(dev, upper, features);
9160
9161        if (dev->features == features)
9162                goto sync_lower;
9163
9164        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9165                &dev->features, &features);
9166
9167        if (dev->netdev_ops->ndo_set_features)
9168                err = dev->netdev_ops->ndo_set_features(dev, features);
9169        else
9170                err = 0;
9171
9172        if (unlikely(err < 0)) {
9173                netdev_err(dev,
9174                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
9175                        err, &features, &dev->features);
9176                /* return non-0 since some features might have changed and
9177                 * it's better to fire a spurious notification than miss it
9178                 */
9179                return -1;
9180        }
9181
9182sync_lower:
9183        /* some features must be disabled on lower devices when disabled
9184         * on an upper device (think: bonding master or bridge)
9185         */
9186        netdev_for_each_lower_dev(dev, lower, iter)
9187                netdev_sync_lower_features(dev, lower, features);
9188
9189        if (!err) {
9190                netdev_features_t diff = features ^ dev->features;
9191
9192                if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9193                        /* udp_tunnel_{get,drop}_rx_info both need
9194                         * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9195                         * device, or they won't do anything.
9196                         * Thus we need to update dev->features
9197                         * *before* calling udp_tunnel_get_rx_info,
9198                         * but *after* calling udp_tunnel_drop_rx_info.
9199                         */
9200                        if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9201                                dev->features = features;
9202                                udp_tunnel_get_rx_info(dev);
9203                        } else {
9204                                udp_tunnel_drop_rx_info(dev);
9205                        }
9206                }
9207
9208                if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9209                        if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9210                                dev->features = features;
9211                                err |= vlan_get_rx_ctag_filter_info(dev);
9212                        } else {
9213                                vlan_drop_rx_ctag_filter_info(dev);
9214                        }
9215                }
9216
9217                if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9218                        if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9219                                dev->features = features;
9220                                err |= vlan_get_rx_stag_filter_info(dev);
9221                        } else {
9222                                vlan_drop_rx_stag_filter_info(dev);
9223                        }
9224                }
9225
9226                dev->features = features;
9227        }
9228
9229        return err < 0 ? 0 : 1;
9230}
9231
9232/**
9233 *      netdev_update_features - recalculate device features
9234 *      @dev: the device to check
9235 *
9236 *      Recalculate the dev->features set and send notifications if it
9237 *      has changed. Should be called after driver- or hardware-dependent
9238 *      conditions that influence the features might have changed.
9239 */
9240void netdev_update_features(struct net_device *dev)
9241{
9242        if (__netdev_update_features(dev))
9243                netdev_features_change(dev);
9244}
9245EXPORT_SYMBOL(netdev_update_features);
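
    /*
     * Example (sketch): a driver re-evaluating its feature set after a
     * runtime condition changed, e.g. firmware revoked an offload.
     * foo_recompute_hw_features() is hypothetical and is assumed to adjust
     * dev->hw_features; RTNL must be held.
     *
     *      rtnl_lock();
     *      foo_recompute_hw_features(dev);
     *      netdev_update_features(dev);
     *      rtnl_unlock();
     */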
9246
9247/**
9248 *      netdev_change_features - recalculate device features
9249 *      @dev: the device to check
9250 *
9251 *      Recalculate the dev->features set and send notifications even
9252 *      if they have not changed. Should be called instead of
9253 *      netdev_update_features() if dev->vlan_features might also
9254 *      have changed, to allow the changes to be propagated to stacked
9255 *      VLAN devices.
9256 */
9257void netdev_change_features(struct net_device *dev)
9258{
9259        __netdev_update_features(dev);
9260        netdev_features_change(dev);
9261}
9262EXPORT_SYMBOL(netdev_change_features);
9263
9264/**
9265 *      netif_stacked_transfer_operstate -      transfer operstate
9266 *      @rootdev: the root or lower level device to transfer state from
9267 *      @dev: the device to transfer operstate to
9268 *
9269 *      Transfer operational state from root to device. This is normally
9270 *      called when a stacking relationship exists between the root
9271 *      device and the device (a leaf device).
9272 */
9273void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9274                                        struct net_device *dev)
9275{
9276        if (rootdev->operstate == IF_OPER_DORMANT)
9277                netif_dormant_on(dev);
9278        else
9279                netif_dormant_off(dev);
9280
9281        if (rootdev->operstate == IF_OPER_TESTING)
9282                netif_testing_on(dev);
9283        else
9284                netif_testing_off(dev);
9285
9286        if (netif_carrier_ok(rootdev))
9287                netif_carrier_on(dev);
9288        else
9289                netif_carrier_off(dev);
9290}
9291EXPORT_SYMBOL(netif_stacked_transfer_operstate);
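
/* Example (editorial sketch): a stacked device such as a VLAN typically
 * mirrors its lower device's operstate from a netdevice notifier; the
 * variable names below are assumptions:
 *
 *        case NETDEV_CHANGE:
 *                netif_stacked_transfer_operstate(real_dev, vlan_dev);
 *                break;
 *
 * where real_dev is the lower (root) device and vlan_dev the stacked leaf
 * device that inherits its state.
 */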
9292
9293static int netif_alloc_rx_queues(struct net_device *dev)
9294{
9295        unsigned int i, count = dev->num_rx_queues;
9296        struct netdev_rx_queue *rx;
9297        size_t sz = count * sizeof(*rx);
9298        int err = 0;
9299
9300        BUG_ON(count < 1);
9301
9302        rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9303        if (!rx)
9304                return -ENOMEM;
9305
9306        dev->_rx = rx;
9307
9308        for (i = 0; i < count; i++) {
9309                rx[i].dev = dev;
9310
9311                /* XDP RX-queue setup */
9312                err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
9313                if (err < 0)
9314                        goto err_rxq_info;
9315        }
9316        return 0;
9317
9318err_rxq_info:
9319        /* Roll back successful registrations and free other resources */
9320        while (i--)
9321                xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9322        kvfree(dev->_rx);
9323        dev->_rx = NULL;
9324        return err;
9325}
9326
9327static void netif_free_rx_queues(struct net_device *dev)
9328{
9329        unsigned int i, count = dev->num_rx_queues;
9330
9331        /* netif_alloc_rx_queues() allocation failed: resources were already unregistered */
9332        if (!dev->_rx)
9333                return;
9334
9335        for (i = 0; i < count; i++)
9336                xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
9337
9338        kvfree(dev->_rx);
9339}
9340
9341static void netdev_init_one_queue(struct net_device *dev,
9342                                  struct netdev_queue *queue, void *_unused)
9343{
9344        /* Initialize queue lock */
9345        spin_lock_init(&queue->_xmit_lock);
9346        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
9347        queue->xmit_lock_owner = -1;
9348        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
9349        queue->dev = dev;
9350#ifdef CONFIG_BQL
9351        dql_init(&queue->dql, HZ);
9352#endif
9353}
9354
9355static void netif_free_tx_queues(struct net_device *dev)
9356{
9357        kvfree(dev->_tx);
9358}
9359
9360static int netif_alloc_netdev_queues(struct net_device *dev)
9361{
9362        unsigned int count = dev->num_tx_queues;
9363        struct netdev_queue *tx;
9364        size_t sz = count * sizeof(*tx);
9365
9366        if (count < 1 || count > 0xffff)
9367                return -EINVAL;
9368
9369        tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9370        if (!tx)
9371                return -ENOMEM;
9372
9373        dev->_tx = tx;
9374
9375        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
9376        spin_lock_init(&dev->tx_global_lock);
9377
9378        return 0;
9379}
9380
9381void netif_tx_stop_all_queues(struct net_device *dev)
9382{
9383        unsigned int i;
9384
9385        for (i = 0; i < dev->num_tx_queues; i++) {
9386                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
9387
9388                netif_tx_stop_queue(txq);
9389        }
9390}
9391EXPORT_SYMBOL(netif_tx_stop_all_queues);
9392
9393/**
9394 *      register_netdevice      - register a network device
9395 *      @dev: device to register
9396 *
9397 *      Take a completed network device structure and add it to the kernel
9398 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9399 *      chain. 0 is returned on success. A negative errno code is returned
9400 *      on a failure to set up the device, or if the name is a duplicate.
9401 *
9402 *      Callers must hold the rtnl semaphore. You may want
9403 *      register_netdev() instead of this.
9404 *
9405 *      BUGS:
9406 *      The locking appears insufficient to guarantee two parallel registers
9407 *      will not get the same name.
9408 */
9409
9410int register_netdevice(struct net_device *dev)
9411{
9412        int ret;
9413        struct net *net = dev_net(dev);
9414
9415        BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
9416                     NETDEV_FEATURE_COUNT);
9417        BUG_ON(dev_boot_phase);
9418        ASSERT_RTNL();
9419
9420        might_sleep();
9421
9422        /* When net_device's are persistent, this will be fatal. */
9423        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
9424        BUG_ON(!net);
9425
9426        ret = ethtool_check_ops(dev->ethtool_ops);
9427        if (ret)
9428                return ret;
9429
9430        spin_lock_init(&dev->addr_list_lock);
9431        netdev_set_addr_lockdep_class(dev);
9432
9433        ret = dev_get_valid_name(net, dev, dev->name);
9434        if (ret < 0)
9435                goto out;
9436
9437        ret = -ENOMEM;
9438        dev->name_node = netdev_name_node_head_alloc(dev);
9439        if (!dev->name_node)
9440                goto out;
9441
9442        /* Init, if this function is available */
9443        if (dev->netdev_ops->ndo_init) {
9444                ret = dev->netdev_ops->ndo_init(dev);
9445                if (ret) {
9446                        if (ret > 0)
9447                                ret = -EIO;
9448                        goto err_free_name;
9449                }
9450        }
9451
9452        if (((dev->hw_features | dev->features) &
9453             NETIF_F_HW_VLAN_CTAG_FILTER) &&
9454            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
9455             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
9456                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
9457                ret = -EINVAL;
9458                goto err_uninit;
9459        }
9460
9461        ret = -EBUSY;
9462        if (!dev->ifindex)
9463                dev->ifindex = dev_new_index(net);
9464        else if (__dev_get_by_index(net, dev->ifindex))
9465                goto err_uninit;
9466
9467        /* Transfer changeable features to wanted_features and enable
9468         * software offloads (GSO and GRO).
9469         */
9470        dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
9471        dev->features |= NETIF_F_SOFT_FEATURES;
9472
9473        if (dev->netdev_ops->ndo_udp_tunnel_add) {
9474                dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9475                dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9476        }
9477
9478        dev->wanted_features = dev->features & dev->hw_features;
9479
9480        if (!(dev->flags & IFF_LOOPBACK))
9481                dev->hw_features |= NETIF_F_NOCACHE_COPY;
9482
9483        /* If IPv4 TCP segmentation offload is supported we should also
9484         * allow the device to enable segmenting the frame with the option
9485         * of ignoring a static IP ID value.  This doesn't enable the
9486         * feature itself but allows the user to enable it later.
9487         */
9488        if (dev->hw_features & NETIF_F_TSO)
9489                dev->hw_features |= NETIF_F_TSO_MANGLEID;
9490        if (dev->vlan_features & NETIF_F_TSO)
9491                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
9492        if (dev->mpls_features & NETIF_F_TSO)
9493                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
9494        if (dev->hw_enc_features & NETIF_F_TSO)
9495                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
9496
9497        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
9498         */
9499        dev->vlan_features |= NETIF_F_HIGHDMA;
9500
9501        /* Make NETIF_F_SG inheritable to tunnel devices.
9502         */
9503        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
9504
9505        /* Make NETIF_F_SG inheritable to MPLS.
9506         */
9507        dev->mpls_features |= NETIF_F_SG;
9508
9509        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
9510        ret = notifier_to_errno(ret);
9511        if (ret)
9512                goto err_uninit;
9513
9514        ret = netdev_register_kobject(dev);
9515        if (ret) {
9516                dev->reg_state = NETREG_UNREGISTERED;
9517                goto err_uninit;
9518        }
9519        dev->reg_state = NETREG_REGISTERED;
9520
9521        __netdev_update_features(dev);
9522
9523        /*
9524         *      The default initial state at registration is that the
9525         *      device is present.
9526         */
9527
9528        set_bit(__LINK_STATE_PRESENT, &dev->state);
9529
9530        linkwatch_init_dev(dev);
9531
9532        dev_init_scheduler(dev);
9533        dev_hold(dev);
9534        list_netdevice(dev);
9535        add_device_randomness(dev->dev_addr, dev->addr_len);
9536
9537        /* If the device has a permanent device address, the driver should
9538         * set dev_addr, and addr_assign_type should also be set to
9539         * NET_ADDR_PERM (default value).
9540         */
9541        if (dev->addr_assign_type == NET_ADDR_PERM)
9542                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
9543
9544        /* Notify protocols that a new device appeared. */
9545        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
9546        ret = notifier_to_errno(ret);
9547        if (ret) {
9548                rollback_registered(dev);
9549                rcu_barrier();
9550
9551                dev->reg_state = NETREG_UNREGISTERED;
9552                /* We should put the kobject that is held in
9553                 * netdev_unregister_kobject(), otherwise
9554                 * the net device cannot be freed when the
9555                 * driver calls free_netdev(), because the
9556                 * kobject is still being held.
9557                 */
9558                kobject_put(&dev->dev.kobj);
9559        }
9560        /*
9561         *      Prevent userspace races by waiting until the network
9562         *      device is fully set up before sending notifications.
9563         */
9564        if (!dev->rtnl_link_ops ||
9565            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
9566                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
9567
9568out:
9569        return ret;
9570
9571err_uninit:
9572        if (dev->netdev_ops->ndo_uninit)
9573                dev->netdev_ops->ndo_uninit(dev);
9574        if (dev->priv_destructor)
9575                dev->priv_destructor(dev);
9576err_free_name:
9577        netdev_name_node_free(dev->name_node);
9578        goto out;
9579}
9580EXPORT_SYMBOL(register_netdevice);
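
/* Example (editorial sketch, not from the original source): this function
 * is for callers that already hold the RTNL, e.g. an rtnl_link_ops
 * ->newlink() implementation; foo_newlink() is a hypothetical name:
 *
 *        static int foo_newlink(struct net *src_net, struct net_device *dev,
 *                               struct nlattr *tb[], struct nlattr *data[],
 *                               struct netlink_ext_ack *extack)
 *        {
 *                ... validate attributes and initialize dev ...
 *                return register_netdevice(dev);
 *        }
 *
 * Code that does not hold the RTNL should use register_netdev() below.
 */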
9581
9582/**
9583 *      init_dummy_netdev       - init a dummy network device for NAPI
9584 *      @dev: device to init
9585 *
9586 *      This takes a network device structure and initializes the minimum
9587 *      number of fields so it can be used to schedule NAPI polls without
9588 *      registering a full blown interface. This is to be used by drivers
9589 *      that need to tie several hardware interfaces to a single NAPI
9590 *      poll scheduler due to HW limitations.
9591 */
9592int init_dummy_netdev(struct net_device *dev)
9593{
9594        /* Clear everything. Note we don't initialize spinlocks
9595         * as they aren't supposed to be taken by any of the
9596         * NAPI code and this dummy netdev is supposed to be
9597         * only ever used for NAPI polls
9598         */
9599        memset(dev, 0, sizeof(struct net_device));
9600
9601        /* make sure we BUG if trying to hit standard
9602         * register/unregister code path
9603         */
9604        dev->reg_state = NETREG_DUMMY;
9605
9606        /* NAPI wants this */
9607        INIT_LIST_HEAD(&dev->napi_list);
9608
9609        /* a dummy interface is started by default */
9610        set_bit(__LINK_STATE_PRESENT, &dev->state);
9611        set_bit(__LINK_STATE_START, &dev->state);
9612
9613        /* napi_busy_loop stats accounting wants this */
9614        dev_net_set(dev, &init_net);
9615
9616        /* Note: We don't allocate pcpu_refcnt for dummy devices,
9617         * because users of this 'device' don't need to change
9618         * its refcount.
9619         */
9620
9621        return 0;
9622}
9623EXPORT_SYMBOL_GPL(init_dummy_netdev);
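
/* Example (editorial sketch): a driver multiplexing several hardware
 * channels behind one NAPI instance can embed a dummy netdev in its
 * private state; struct foo_adapter and foo_poll() are assumptions:
 *
 *        struct foo_adapter {
 *                struct net_device napi_dev;
 *                struct napi_struct napi;
 *        };
 *
 *        init_dummy_netdev(&adapter->napi_dev);
 *        netif_napi_add(&adapter->napi_dev, &adapter->napi, foo_poll, 64);
 *        napi_enable(&adapter->napi);
 */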
9624
9625
9626/**
9627 *      register_netdev - register a network device
9628 *      @dev: device to register
9629 *
9630 *      Take a completed network device structure and add it to the kernel
9631 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9632 *      chain. 0 is returned on success. A negative errno code is returned
9633 *      on a failure to set up the device, or if the name is a duplicate.
9634 *
9635 *      This is a wrapper around register_netdevice that takes the rtnl semaphore
9636 *      and expands the device name if you passed a format string to
9637 *      alloc_netdev.
9638 */
9639int register_netdev(struct net_device *dev)
9640{
9641        int err;
9642
9643        if (rtnl_lock_killable())
9644                return -EINTR;
9645        err = register_netdevice(dev);
9646        rtnl_unlock();
9647        return err;
9648}
9649EXPORT_SYMBOL(register_netdev);
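
/* Example (editorial sketch): the usual driver probe sequence around
 * register_netdev(); struct foo_priv, foo_netdev_ops and the pdev parent
 * device are assumptions:
 *
 *        ndev = alloc_etherdev(sizeof(struct foo_priv));
 *        if (!ndev)
 *                return -ENOMEM;
 *        ndev->netdev_ops = &foo_netdev_ops;
 *        SET_NETDEV_DEV(ndev, &pdev->dev);
 *
 *        err = register_netdev(ndev);
 *        if (err) {
 *                free_netdev(ndev);
 *                return err;
 *        }
 */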
9650
9651int netdev_refcnt_read(const struct net_device *dev)
9652{
9653        int i, refcnt = 0;
9654
9655        for_each_possible_cpu(i)
9656                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
9657        return refcnt;
9658}
9659EXPORT_SYMBOL(netdev_refcnt_read);
9660
9661/**
9662 * netdev_wait_allrefs - wait until all references are gone.
9663 * @dev: target net_device
9664 *
9665 * This is called when unregistering network devices.
9666 *
9667 * Any protocol or device that holds a reference should register
9668 * for netdevice notification, and cleanup and put back the
9669 * reference if they receive an UNREGISTER event.
9670 * We can get stuck here if buggy protocols don't correctly
9671 * call dev_put.
9672 */
9673static void netdev_wait_allrefs(struct net_device *dev)
9674{
9675        unsigned long rebroadcast_time, warning_time;
9676        int refcnt;
9677
9678        linkwatch_forget_dev(dev);
9679
9680        rebroadcast_time = warning_time = jiffies;
9681        refcnt = netdev_refcnt_read(dev);
9682
9683        while (refcnt != 0) {
9684                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
9685                        rtnl_lock();
9686
9687                        /* Rebroadcast unregister notification */
9688                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
9689
9690                        __rtnl_unlock();
9691                        rcu_barrier();
9692                        rtnl_lock();
9693
9694                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
9695                                     &dev->state)) {
9696                                /* We must not have linkwatch events
9697                                 * pending on unregister. If this
9698                                 * happens, we simply run the queue
9699                                 * unscheduled, resulting in a noop
9700                                 * for this device.
9701                                 */
9702                                linkwatch_run_queue();
9703                        }
9704
9705                        __rtnl_unlock();
9706
9707                        rebroadcast_time = jiffies;
9708                }
9709
9710                msleep(250);
9711
9712                refcnt = netdev_refcnt_read(dev);
9713
9714                if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
9715                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
9716                                 dev->name, refcnt);
9717                        warning_time = jiffies;
9718                }
9719        }
9720}
9721
9722/* The sequence is:
9723 *
9724 *      rtnl_lock();
9725 *      ...
9726 *      register_netdevice(x1);
9727 *      register_netdevice(x2);
9728 *      ...
9729 *      unregister_netdevice(y1);
9730 *      unregister_netdevice(y2);
9731 *      ...
9732 *      rtnl_unlock();
9733 *      free_netdev(y1);
9734 *      free_netdev(y2);
9735 *
9736 * We are invoked by rtnl_unlock().
9737 * This allows us to deal with problems:
9738 * 1) We can delete sysfs objects which invoke hotplug
9739 *    without deadlocking with linkwatch via keventd.
9740 * 2) Since we run with the RTNL semaphore not held, we can sleep
9741 *    safely in order to wait for the netdev refcnt to drop to zero.
9742 *
9743 * We must not return until all unregister events added during
9744 * the interval the lock was held have been completed.
9745 */
9746void netdev_run_todo(void)
9747{
9748        struct list_head list;
9749
9750        /* Snapshot list, allow later requests */
9751        list_replace_init(&net_todo_list, &list);
9752
9753        __rtnl_unlock();
9754
9755
9756        /* Wait for rcu callbacks to finish before next phase */
9757        if (!list_empty(&list))
9758                rcu_barrier();
9759
9760        while (!list_empty(&list)) {
9761                struct net_device *dev
9762                        = list_first_entry(&list, struct net_device, todo_list);
9763                list_del(&dev->todo_list);
9764
9765                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
9766                        pr_err("network todo '%s' but state %d\n",
9767                               dev->name, dev->reg_state);
9768                        dump_stack();
9769                        continue;
9770                }
9771
9772                dev->reg_state = NETREG_UNREGISTERED;
9773
9774                netdev_wait_allrefs(dev);
9775
9776                /* paranoia */
9777                BUG_ON(netdev_refcnt_read(dev));
9778                BUG_ON(!list_empty(&dev->ptype_all));
9779                BUG_ON(!list_empty(&dev->ptype_specific));
9780                WARN_ON(rcu_access_pointer(dev->ip_ptr));
9781                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
9782#if IS_ENABLED(CONFIG_DECNET)
9783                WARN_ON(dev->dn_ptr);
9784#endif
9785                if (dev->priv_destructor)
9786                        dev->priv_destructor(dev);
9787                if (dev->needs_free_netdev)
9788                        free_netdev(dev);
9789
9790                /* Report a network device has been unregistered */
9791                rtnl_lock();
9792                dev_net(dev)->dev_unreg_count--;
9793                __rtnl_unlock();
9794                wake_up(&netdev_unregistering_wq);
9795
9796                /* Free network device */
9797                kobject_put(&dev->dev.kobj);
9798        }
9799}
9800
9801/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
9802 * all the same fields in the same order as net_device_stats, with only
9803 * the type differing, but rtnl_link_stats64 may have additional fields
9804 * at the end for newer counters.
9805 */
9806void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
9807                             const struct net_device_stats *netdev_stats)
9808{
9809#if BITS_PER_LONG == 64
9810        BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
9811        memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
9812        /* zero out counters that only exist in rtnl_link_stats64 */
9813        memset((char *)stats64 + sizeof(*netdev_stats), 0,
9814               sizeof(*stats64) - sizeof(*netdev_stats));
9815#else
9816        size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
9817        const unsigned long *src = (const unsigned long *)netdev_stats;
9818        u64 *dst = (u64 *)stats64;
9819
9820        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
9821        for (i = 0; i < n; i++)
9822                dst[i] = src[i];
9823        /* zero out counters that only exist in rtnl_link_stats64 */
9824        memset((char *)stats64 + n * sizeof(u64), 0,
9825               sizeof(*stats64) - n * sizeof(u64));
9826#endif
9827}
9828EXPORT_SYMBOL(netdev_stats_to_stats64);
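
/* Example (editorial sketch): a driver that still accounts into the legacy
 * dev->stats structure can implement ndo_get_stats64() as a thin
 * conversion; foo_get_stats64() and foo_hw_missed() are hypothetical:
 *
 *        static void foo_get_stats64(struct net_device *dev,
 *                                    struct rtnl_link_stats64 *stats)
 *        {
 *                netdev_stats_to_stats64(stats, &dev->stats);
 *                stats->rx_missed_errors += foo_hw_missed(dev);
 *        }
 */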
9829
9830/**
9831 *      dev_get_stats   - get network device statistics
9832 *      @dev: device to get statistics from
9833 *      @storage: place to store stats
9834 *
9835 *      Get network statistics from device. Return @storage.
9836 *      The device driver may provide its own method by setting
9837 *      dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
9838 *      otherwise the internal statistics structure is used.
9839 */
9840struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
9841                                        struct rtnl_link_stats64 *storage)
9842{
9843        const struct net_device_ops *ops = dev->netdev_ops;
9844
9845        if (ops->ndo_get_stats64) {
9846                memset(storage, 0, sizeof(*storage));
9847                ops->ndo_get_stats64(dev, storage);
9848        } else if (ops->ndo_get_stats) {
9849                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
9850        } else {
9851                netdev_stats_to_stats64(storage, &dev->stats);
9852        }
9853        storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
9854        storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
9855        storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
9856        return storage;
9857}
9858EXPORT_SYMBOL(dev_get_stats);
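
/* Example (editorial sketch): callers pass their own storage and may use
 * the returned pointer directly; the pr_info() consumer is an assumption:
 *
 *        struct rtnl_link_stats64 temp;
 *        const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *        pr_info("%s: %llu packets received\n", dev->name, stats->rx_packets);
 */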
9859
9860struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
9861{
9862        struct netdev_queue *queue = dev_ingress_queue(dev);
9863
9864#ifdef CONFIG_NET_CLS_ACT
9865        if (queue)
9866                return queue;
9867        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
9868        if (!queue)
9869                return NULL;
9870        netdev_init_one_queue(dev, queue, NULL);
9871        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
9872        queue->qdisc_sleeping = &noop_qdisc;
9873        rcu_assign_pointer(dev->ingress_queue, queue);
9874#endif
9875        return queue;
9876}
9877
9878static const struct ethtool_ops default_ethtool_ops;
9879
9880void netdev_set_default_ethtool_ops(struct net_device *dev,
9881                                    const struct ethtool_ops *ops)
9882{
9883        if (dev->ethtool_ops == &default_ethtool_ops)
9884                dev->ethtool_ops = ops;
9885}
9886EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
9887
9888void netdev_freemem(struct net_device *dev)
9889{
9890        char *addr = (char *)dev - dev->padded;
9891
9892        kvfree(addr);
9893}
9894
9895/**
9896 * alloc_netdev_mqs - allocate network device
9897 * @sizeof_priv: size of private data to allocate space for
9898 * @name: device name format string
9899 * @name_assign_type: origin of device name
9900 * @setup: callback to initialize device
9901 * @txqs: the number of TX subqueues to allocate
9902 * @rxqs: the number of RX subqueues to allocate
9903 *
9904 * Allocates a struct net_device with private data area for driver use
9905 * and performs basic initialization.  Also allocates subqueue structs
9906 * for each queue on the device.
9907 */
9908struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
9909                unsigned char name_assign_type,
9910                void (*setup)(struct net_device *),
9911                unsigned int txqs, unsigned int rxqs)
9912{
9913        struct net_device *dev;
9914        unsigned int alloc_size;
9915        struct net_device *p;
9916
9917        BUG_ON(strlen(name) >= sizeof(dev->name));
9918
9919        if (txqs < 1) {
9920                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
9921                return NULL;
9922        }
9923
9924        if (rxqs < 1) {
9925                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
9926                return NULL;
9927        }
9928
9929        alloc_size = sizeof(struct net_device);
9930        if (sizeof_priv) {
9931                /* ensure 32-byte alignment of private area */
9932                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
9933                alloc_size += sizeof_priv;
9934        }
9935        /* ensure 32-byte alignment of whole construct */
9936        alloc_size += NETDEV_ALIGN - 1;
9937
9938        p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9939        if (!p)
9940                return NULL;
9941
9942        dev = PTR_ALIGN(p, NETDEV_ALIGN);
9943        dev->padded = (char *)dev - (char *)p;
9944
9945        dev->pcpu_refcnt = alloc_percpu(int);
9946        if (!dev->pcpu_refcnt)
9947                goto free_dev;
9948
9949        if (dev_addr_init(dev))
9950                goto free_pcpu;
9951
9952        dev_mc_init(dev);
9953        dev_uc_init(dev);
9954
9955        dev_net_set(dev, &init_net);
9956
9957        dev->gso_max_size = GSO_MAX_SIZE;
9958        dev->gso_max_segs = GSO_MAX_SEGS;
9959        dev->upper_level = 1;
9960        dev->lower_level = 1;
9961
9962        INIT_LIST_HEAD(&dev->napi_list);
9963        INIT_LIST_HEAD(&dev->unreg_list);
9964        INIT_LIST_HEAD(&dev->close_list);
9965        INIT_LIST_HEAD(&dev->link_watch_list);
9966        INIT_LIST_HEAD(&dev->adj_list.upper);
9967        INIT_LIST_HEAD(&dev->adj_list.lower);
9968        INIT_LIST_HEAD(&dev->ptype_all);
9969        INIT_LIST_HEAD(&dev->ptype_specific);
9970        INIT_LIST_HEAD(&dev->net_notifier_list);
9971#ifdef CONFIG_NET_SCHED
9972        hash_init(dev->qdisc_hash);
9973#endif
9974        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
9975        setup(dev);
9976
9977        if (!dev->tx_queue_len) {
9978                dev->priv_flags |= IFF_NO_QUEUE;
9979                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
9980        }
9981
9982        dev->num_tx_queues = txqs;
9983        dev->real_num_tx_queues = txqs;
9984        if (netif_alloc_netdev_queues(dev))
9985                goto free_all;
9986
9987        dev->num_rx_queues = rxqs;
9988        dev->real_num_rx_queues = rxqs;
9989        if (netif_alloc_rx_queues(dev))
9990                goto free_all;
9991
9992        strcpy(dev->name, name);
9993        dev->name_assign_type = name_assign_type;
9994        dev->group = INIT_NETDEV_GROUP;
9995        if (!dev->ethtool_ops)
9996                dev->ethtool_ops = &default_ethtool_ops;
9997
9998        nf_hook_ingress_init(dev);
9999
10000        return dev;
10001
10002free_all:
10003        free_netdev(dev);
10004        return NULL;
10005
10006free_pcpu:
10007        free_percpu(dev->pcpu_refcnt);
10008free_dev:
10009        netdev_freemem(dev);
10010        return NULL;
10011}
10012EXPORT_SYMBOL(alloc_netdev_mqs);
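
/* Example (editorial sketch): wrappers such as alloc_etherdev_mqs() end up
 * here; an Ethernet-style device with 8 TX and 8 RX queues could be
 * allocated roughly like this (struct foo_priv and the queue counts are
 * assumptions):
 *
 *        dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *                               NET_NAME_UNKNOWN, ether_setup, 8, 8);
 *        if (!dev)
 *                return -ENOMEM;
 */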
10013
10014/**
10015 * free_netdev - free network device
10016 * @dev: device
10017 *
10018 * This function does the last stage of destroying an allocated device
10019 * interface. The reference to the device object is released. If this
10020 * is the last reference then it will be freed. Must be called in process
10021 * context.
10022 */
10023void free_netdev(struct net_device *dev)
10024{
10025        struct napi_struct *p, *n;
10026
10027        might_sleep();
10028        netif_free_tx_queues(dev);
10029        netif_free_rx_queues(dev);
10030
10031        kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10032
10033        /* Flush device addresses */
10034        dev_addr_flush(dev);
10035
10036        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10037                netif_napi_del(p);
10038
10039        free_percpu(dev->pcpu_refcnt);
10040        dev->pcpu_refcnt = NULL;
10041        free_percpu(dev->xdp_bulkq);
10042        dev->xdp_bulkq = NULL;
10043
10044        /*  Compatibility with error handling in drivers */
10045        if (dev->reg_state == NETREG_UNINITIALIZED) {
10046                netdev_freemem(dev);
10047                return;
10048        }
10049
10050        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10051        dev->reg_state = NETREG_RELEASED;
10052
10053        /* will free via device release */
10054        put_device(&dev->dev);
10055}
10056EXPORT_SYMBOL(free_netdev);
10057
10058/**
10059 *      synchronize_net -  Synchronize with packet receive processing
10060 *
10061 *      Wait for packets currently being received to be done.
10062 *      Does not block later packets from starting.
10063 */
10064void synchronize_net(void)
10065{
10066        might_sleep();
10067        if (rtnl_is_locked())
10068                synchronize_rcu_expedited();
10069        else
10070                synchronize_rcu();
10071}
10072EXPORT_SYMBOL(synchronize_net);
10073
10074/**
10075 *      unregister_netdevice_queue - remove device from the kernel
10076 *      @dev: device
10077 *      @head: list
10078 *
10079 *      This function shuts down a device interface and removes it
10080 *      from the kernel tables.
10081 *      If @head is not NULL, the device is queued to be unregistered later.
10082 *
10083 *      Callers must hold the rtnl semaphore.  You may want
10084 *      unregister_netdev() instead of this.
10085 */
10086
10087void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10088{
10089        ASSERT_RTNL();
10090
10091        if (head) {
10092                list_move_tail(&dev->unreg_list, head);
10093        } else {
10094                rollback_registered(dev);
10095                /* Finish processing unregister after unlock */
10096                net_set_todo(dev);
10097        }
10098}
10099EXPORT_SYMBOL(unregister_netdevice_queue);
10100
10101/**
10102 *      unregister_netdevice_many - unregister many devices
10103 *      @head: list of devices
10104 *
10105 *  Note: As most callers use a stack allocated list_head,
10106 *  we force a list_del() to make sure the stack won't be corrupted later.
10107 */
10108void unregister_netdevice_many(struct list_head *head)
10109{
10110        struct net_device *dev;
10111
10112        if (!list_empty(head)) {
10113                rollback_registered_many(head);
10114                list_for_each_entry(dev, head, unreg_list)
10115                        net_set_todo(dev);
10116                list_del(head);
10117        }
10118}
10119EXPORT_SYMBOL(unregister_netdevice_many);
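
/* Example (editorial sketch): batching several unregistrations in a single
 * RTNL section, as rtnl_link ->dellink() style code commonly does:
 *
 *        LIST_HEAD(list);
 *
 *        rtnl_lock();
 *        unregister_netdevice_queue(dev1, &list);
 *        unregister_netdevice_queue(dev2, &list);
 *        unregister_netdevice_many(&list);
 *        rtnl_unlock();
 */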
10120
10121/**
10122 *      unregister_netdev - remove device from the kernel
10123 *      @dev: device
10124 *
10125 *      This function shuts down a device interface and removes it
10126 *      from the kernel tables.
10127 *
10128 *      This is just a wrapper for unregister_netdevice that takes
10129 *      the rtnl semaphore.  In general you want to use this and not
10130 *      unregister_netdevice.
10131 */
10132void unregister_netdev(struct net_device *dev)
10133{
10134        rtnl_lock();
10135        unregister_netdevice(dev);
10136        rtnl_unlock();
10137}
10138EXPORT_SYMBOL(unregister_netdev);
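
/* Example (editorial sketch): the usual driver remove path, mirroring the
 * probe sketch near register_netdev() above:
 *
 *        unregister_netdev(ndev);
 *        free_netdev(ndev);
 */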
10139
10140/**
10141 *      dev_change_net_namespace - move device to a different network namespace
10142 *      @dev: device
10143 *      @net: network namespace
10144 *      @pat: If not NULL, name pattern to try if the current device name
10145 *            is already taken in the destination network namespace.
10146 *
10147 *      This function shuts down a device interface and moves it
10148 *      to a new network namespace. On success 0 is returned, on
10149 *      a failure a negative errno code is returned.
10150 *
10151 *      Callers must hold the rtnl semaphore.
10152 */
10153
10154int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10155{
10156        struct net *net_old = dev_net(dev);
10157        int err, new_nsid, new_ifindex;
10158
10159        ASSERT_RTNL();
10160
10161        /* Don't allow namespace local devices to be moved. */
10162        err = -EINVAL;
10163        if (dev->features & NETIF_F_NETNS_LOCAL)
10164                goto out;
10165
10166        /* Ensure the device has been registered */
10167        if (dev->reg_state != NETREG_REGISTERED)
10168                goto out;
10169
10170        /* Get out if there is nothing to do */
10171        err = 0;
10172        if (net_eq(net_old, net))
10173                goto out;
10174
10175        /* Pick the destination device name, and ensure
10176         * we can use it in the destination network namespace.
10177         */
10178        err = -EEXIST;
10179        if (__dev_get_by_name(net, dev->name)) {
10180                /* We get here if we can't use the current device name */
10181                if (!pat)
10182                        goto out;
10183                err = dev_get_valid_name(net, dev, pat);
10184                if (err < 0)
10185                        goto out;
10186        }
10187
10188        /*
10189         * And now a mini version of register_netdevice and unregister_netdevice.
10190         */
10191
10192        /* If device is running close it first. */
10193        dev_close(dev);
10194
10195        /* And unlink it from device chain */
10196        unlist_netdevice(dev);
10197
10198        synchronize_net();
10199
10200        /* Shutdown queueing discipline. */
10201        dev_shutdown(dev);
10202
10203        /* Notify protocols that we are about to destroy
10204         * this device. They should clean all the things.
10205         *
10206         * Note that dev->reg_state stays at NETREG_REGISTERED.
10207         * This is wanted because this way 8021q and macvlan know
10208         * the device is just moving and can keep their slaves up.
10209         */
10210        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10211        rcu_barrier();
10212
10213        new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10214        /* If there is an ifindex conflict assign a new one */
10215        if (__dev_get_by_index(net, dev->ifindex))
10216                new_ifindex = dev_new_index(net);
10217        else
10218                new_ifindex = dev->ifindex;
10219
10220        rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10221                            new_ifindex);
10222
10223        /*
10224         *      Flush the unicast and multicast chains
10225         */
10226        dev_uc_flush(dev);
10227        dev_mc_flush(dev);
10228
10229        /* Send a netdev-removed uevent to the old namespace */
10230        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10231        netdev_adjacent_del_links(dev);
10232
10233        /* Move per-net netdevice notifiers that are following the netdevice */
10234        move_netdevice_notifiers_dev_net(dev, net);
10235
10236        /* Actually switch the network namespace */
10237        dev_net_set(dev, net);
10238        dev->ifindex = new_ifindex;
10239
10240        /* Send a netdev-add uevent to the new namespace */
10241        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10242        netdev_adjacent_add_links(dev);
10243
10244        /* Fixup kobjects */
10245        err = device_rename(&dev->dev, dev->name);
10246        WARN_ON(err);
10247
10248        /* Adapt owner in case owning user namespace of target network
10249         * namespace is different from the original one.
10250         */
10251        err = netdev_change_owner(dev, net_old, net);
10252        WARN_ON(err);
10253
10254        /* Add the device back in the hashes */
10255        list_netdevice(dev);
10256
10257        /* Notify protocols that a new device appeared. */
10258        call_netdevice_notifiers(NETDEV_REGISTER, dev);
10259
10260        /*
10261         *      Prevent userspace races by waiting until the network
10262         *      device is fully set up before sending notifications.
10263         */
10264        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10265
10266        synchronize_net();
10267        err = 0;
10268out:
10269        return err;
10270}
10271EXPORT_SYMBOL_GPL(dev_change_net_namespace);
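
/* Example (editorial sketch): moving a device into another namespace,
 * falling back to an "eth%d" pattern if its current name is already taken
 * there; obtaining and releasing the struct net reference is left out:
 *
 *        rtnl_lock();
 *        err = dev_change_net_namespace(dev, net, "eth%d");
 *        rtnl_unlock();
 */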
10272
10273static int dev_cpu_dead(unsigned int oldcpu)
10274{
10275        struct sk_buff **list_skb;
10276        struct sk_buff *skb;
10277        unsigned int cpu;
10278        struct softnet_data *sd, *oldsd, *remsd = NULL;
10279
10280        local_irq_disable();
10281        cpu = smp_processor_id();
10282        sd = &per_cpu(softnet_data, cpu);
10283        oldsd = &per_cpu(softnet_data, oldcpu);
10284
10285        /* Find end of our completion_queue. */
10286        list_skb = &sd->completion_queue;
10287        while (*list_skb)
10288                list_skb = &(*list_skb)->next;
10289        /* Append completion queue from offline CPU. */
10290        *list_skb = oldsd->completion_queue;
10291        oldsd->completion_queue = NULL;
10292
10293        /* Append output queue from offline CPU. */
10294        if (oldsd->output_queue) {
10295                *sd->output_queue_tailp = oldsd->output_queue;
10296                sd->output_queue_tailp = oldsd->output_queue_tailp;
10297                oldsd->output_queue = NULL;
10298                oldsd->output_queue_tailp = &oldsd->output_queue;
10299        }
10300        /* Append NAPI poll list from offline CPU, with one exception:
10301         * process_backlog() must be called by the CPU owning the percpu backlog.
10302         * We properly handle process_queue & input_pkt_queue later.
10303         */
10304        while (!list_empty(&oldsd->poll_list)) {
10305                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10306                                                            struct napi_struct,
10307                                                            poll_list);
10308
10309                list_del_init(&napi->poll_list);
10310                if (napi->poll == process_backlog)
10311                        napi->state = 0;
10312                else
10313                        ____napi_schedule(sd, napi);
10314        }
10315
10316        raise_softirq_irqoff(NET_TX_SOFTIRQ);
10317        local_irq_enable();
10318
10319#ifdef CONFIG_RPS
10320        remsd = oldsd->rps_ipi_list;
10321        oldsd->rps_ipi_list = NULL;
10322#endif
10323        /* send out pending IPI's on offline CPU */
10324        net_rps_send_ipi(remsd);
10325
10326        /* Process offline CPU's input_pkt_queue */
10327        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
10328                netif_rx_ni(skb);
10329                input_queue_head_incr(oldsd);
10330        }
10331        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
10332                netif_rx_ni(skb);
10333                input_queue_head_incr(oldsd);
10334        }
10335
10336        return 0;
10337}
10338
10339/**
10340 *      netdev_increment_features - increment feature set by one
10341 *      @all: current feature set
10342 *      @one: new feature set
10343 *      @mask: mask feature set
10344 *
10345 *      Computes a new feature set after adding a device with feature set
10346 *      @one to the master device with current feature set @all.  Will not
10347 *      enable anything that is off in @mask. Returns the new feature set.
10348 */
10349netdev_features_t netdev_increment_features(netdev_features_t all,
10350        netdev_features_t one, netdev_features_t mask)
10351{
10352        if (mask & NETIF_F_HW_CSUM)
10353                mask |= NETIF_F_CSUM_MASK;
10354        mask |= NETIF_F_VLAN_CHALLENGED;
10355
10356        all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
10357        all &= one | ~NETIF_F_ALL_FOR_ALL;
10358
10359        /* If one device supports hw checksumming, set for all. */
10360        if (all & NETIF_F_HW_CSUM)
10361                all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
10362
10363        return all;
10364}
10365EXPORT_SYMBOL(netdev_increment_features);
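
/* Example (editorial sketch): an aggregating master (bonding/team style)
 * folds each slave's features into its own set; the iteration below is
 * schematic and the starting value is only an illustration:
 *
 *        netdev_features_t features = master->vlan_features;
 *
 *        ... for each slave device sdev ...
 *                features = netdev_increment_features(features,
 *                                                     sdev->features, mask);
 *        master->features = features;
 */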
10366
10367static struct hlist_head * __net_init netdev_create_hash(void)
10368{
10369        int i;
10370        struct hlist_head *hash;
10371
10372        hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
10373        if (hash != NULL)
10374                for (i = 0; i < NETDEV_HASHENTRIES; i++)
10375                        INIT_HLIST_HEAD(&hash[i]);
10376
10377        return hash;
10378}
10379
10380/* Initialize per network namespace state */
10381static int __net_init netdev_init(struct net *net)
10382{
10383        BUILD_BUG_ON(GRO_HASH_BUCKETS >
10384                     8 * sizeof_field(struct napi_struct, gro_bitmask));
10385
10386        if (net != &init_net)
10387                INIT_LIST_HEAD(&net->dev_base_head);
10388
10389        net->dev_name_head = netdev_create_hash();
10390        if (net->dev_name_head == NULL)
10391                goto err_name;
10392
10393        net->dev_index_head = netdev_create_hash();
10394        if (net->dev_index_head == NULL)
10395                goto err_idx;
10396
10397        RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
10398
10399        return 0;
10400
10401err_idx:
10402        kfree(net->dev_name_head);
10403err_name:
10404        return -ENOMEM;
10405}
10406
10407/**
10408 *      netdev_drivername - network driver for the device
10409 *      @dev: network device
10410 *
10411 *      Determine network driver for device.
10412 */
10413const char *netdev_drivername(const struct net_device *dev)
10414{
10415        const struct device_driver *driver;
10416        const struct device *parent;
10417        const char *empty = "";
10418
10419        parent = dev->dev.parent;
10420        if (!parent)
10421                return empty;
10422
10423        driver = parent->driver;
10424        if (driver && driver->name)
10425                return driver->name;
10426        return empty;
10427}
10428
10429static void __netdev_printk(const char *level, const struct net_device *dev,
10430                            struct va_format *vaf)
10431{
10432        if (dev && dev->dev.parent) {
10433                dev_printk_emit(level[1] - '0',
10434                                dev->dev.parent,
10435                                "%s %s %s%s: %pV",
10436                                dev_driver_string(dev->dev.parent),
10437                                dev_name(dev->dev.parent),
10438                                netdev_name(dev), netdev_reg_state(dev),
10439                                vaf);
10440        } else if (dev) {
10441                printk("%s%s%s: %pV",
10442                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
10443        } else {
10444                printk("%s(NULL net_device): %pV", level, vaf);
10445        }
10446}
10447
10448void netdev_printk(const char *level, const struct net_device *dev,
10449                   const char *format, ...)
10450{
10451        struct va_format vaf;
10452        va_list args;
10453
10454        va_start(args, format);
10455
10456        vaf.fmt = format;
10457        vaf.va = &args;
10458
10459        __netdev_printk(level, dev, &vaf);
10460
10461        va_end(args);
10462}
10463EXPORT_SYMBOL(netdev_printk);
10464
10465#define define_netdev_printk_level(func, level)                 \
10466void func(const struct net_device *dev, const char *fmt, ...)   \
10467{                                                               \
10468        struct va_format vaf;                                   \
10469        va_list args;                                           \
10470                                                                \
10471        va_start(args, fmt);                                    \
10472                                                                \
10473        vaf.fmt = fmt;                                          \
10474        vaf.va = &args;                                         \
10475                                                                \
10476        __netdev_printk(level, dev, &vaf);                      \
10477                                                                \
10478        va_end(args);                                           \
10479}                                                               \
10480EXPORT_SYMBOL(func);
10481
10482define_netdev_printk_level(netdev_emerg, KERN_EMERG);
10483define_netdev_printk_level(netdev_alert, KERN_ALERT);
10484define_netdev_printk_level(netdev_crit, KERN_CRIT);
10485define_netdev_printk_level(netdev_err, KERN_ERR);
10486define_netdev_printk_level(netdev_warn, KERN_WARNING);
10487define_netdev_printk_level(netdev_notice, KERN_NOTICE);
10488define_netdev_printk_level(netdev_info, KERN_INFO);
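
/* Example (editorial sketch): the helpers generated above are used like
 * dev_err()/dev_info() but prefix messages with driver, bus and netdev
 * names; the message text and variables are assumptions:
 *
 *        netdev_err(dev, "link training failed, error %d\n", err);
 *        netdev_info(dev, "link up, %u Mb/s\n", speed);
 */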
10489
10490static void __net_exit netdev_exit(struct net *net)
10491{
10492        kfree(net->dev_name_head);
10493        kfree(net->dev_index_head);
10494        if (net != &init_net)
10495                WARN_ON_ONCE(!list_empty(&net->dev_base_head));
10496}
10497
10498static struct pernet_operations __net_initdata netdev_net_ops = {
10499        .init = netdev_init,
10500        .exit = netdev_exit,
10501};
10502
10503static void __net_exit default_device_exit(struct net *net)
10504{
10505        struct net_device *dev, *aux;
10506        /*
10507         * Push all migratable network devices back to the
10508         * initial network namespace
10509         */
10510        rtnl_lock();
10511        for_each_netdev_safe(net, dev, aux) {
10512                int err;
10513                char fb_name[IFNAMSIZ];
10514
10515                /* Ignore unmovable devices (e.g. loopback) */
10516                if (dev->features & NETIF_F_NETNS_LOCAL)
10517                        continue;
10518
10519                /* Leave virtual devices for the generic cleanup */
10520                if (dev->rtnl_link_ops)
10521                        continue;
10522
10523                /* Push remaining network devices to init_net */
10524                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
10525                if (__dev_get_by_name(&init_net, fb_name))
10526                        snprintf(fb_name, IFNAMSIZ, "dev%%d");
10527                err = dev_change_net_namespace(dev, &init_net, fb_name);
10528                if (err) {
10529                        pr_emerg("%s: failed to move %s to init_net: %d\n",
10530                                 __func__, dev->name, err);
10531                        BUG();
10532                }
10533        }
10534        rtnl_unlock();
10535}
10536
10537static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
10538{
10539        /* Return with the rtnl_lock held when there are no network
10540         * devices unregistering in any network namespace in net_list.
10541         */
10542        struct net *net;
10543        bool unregistering;
10544        DEFINE_WAIT_FUNC(wait, woken_wake_function);
10545
10546        add_wait_queue(&netdev_unregistering_wq, &wait);
10547        for (;;) {
10548                unregistering = false;
10549                rtnl_lock();
10550                list_for_each_entry(net, net_list, exit_list) {
10551                        if (net->dev_unreg_count > 0) {
10552                                unregistering = true;
10553                                break;
10554                        }
10555                }
10556                if (!unregistering)
10557                        break;
10558                __rtnl_unlock();
10559
10560                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
10561        }
10562        remove_wait_queue(&netdev_unregistering_wq, &wait);
10563}
10564
10565static void __net_exit default_device_exit_batch(struct list_head *net_list)
10566{
10567        /* At exit all network devices must be removed from a network
10568         * namespace.  Do this in the reverse order of registration.
10569         * Do this across as many network namespaces as possible to
10570         * improve batching efficiency.
10571         */
10572        struct net_device *dev;
10573        struct net *net;
10574        LIST_HEAD(dev_kill_list);
10575
10576        /* To prevent network device cleanup code from dereferencing
10577         * loopback devices or network devices that have been freed
10578         * wait here for all pending unregistrations to complete,
10579         * before unregistering the loopback device and allowing the
10580         * network namespace to be freed.
10581         *
10582         * The netdev todo list containing all network device
10583         * unregistrations that happen in default_device_exit_batch
10584         * will run in the rtnl_unlock() at the end of
10585         * default_device_exit_batch.
10586         */
10587        rtnl_lock_unregistering(net_list);
10588        list_for_each_entry(net, net_list, exit_list) {
10589                for_each_netdev_reverse(net, dev) {
10590                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
10591                                dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
10592                        else
10593                                unregister_netdevice_queue(dev, &dev_kill_list);
10594                }
10595        }
10596        unregister_netdevice_many(&dev_kill_list);
10597        rtnl_unlock();
10598}
10599
10600static struct pernet_operations __net_initdata default_device_ops = {
10601        .exit = default_device_exit,
10602        .exit_batch = default_device_exit_batch,
10603};
10604
10605/*
10606 *      Initialize the DEV module. At boot time this walks the device list and
10607 *      unhooks any devices that fail to initialise (normally hardware not
10608 *      present) and leaves us with a valid list of present and active devices.
10609 *
10610 */
10611
10612/*
10613 *       This is called single threaded during boot, so no need
10614 *       to take the rtnl semaphore.
10615 */
10616static int __init net_dev_init(void)
10617{
10618        int i, rc = -ENOMEM;
10619
10620        BUG_ON(!dev_boot_phase);
10621
10622        if (dev_proc_init())
10623                goto out;
10624
10625        if (netdev_kobject_init())
10626                goto out;
10627
10628        INIT_LIST_HEAD(&ptype_all);
10629        for (i = 0; i < PTYPE_HASH_SIZE; i++)
10630                INIT_LIST_HEAD(&ptype_base[i]);
10631
10632        INIT_LIST_HEAD(&offload_base);
10633
10634        if (register_pernet_subsys(&netdev_net_ops))
10635                goto out;
10636
10637        /*
10638         *      Initialise the packet receive queues.
10639         */
10640
10641        for_each_possible_cpu(i) {
10642                struct work_struct *flush = per_cpu_ptr(&flush_works, i);
10643                struct softnet_data *sd = &per_cpu(softnet_data, i);
10644
10645                INIT_WORK(flush, flush_backlog);
10646
10647                skb_queue_head_init(&sd->input_pkt_queue);
10648                skb_queue_head_init(&sd->process_queue);
10649#ifdef CONFIG_XFRM_OFFLOAD
10650                skb_queue_head_init(&sd->xfrm_backlog);
10651#endif
10652                INIT_LIST_HEAD(&sd->poll_list);
10653                sd->output_queue_tailp = &sd->output_queue;
10654#ifdef CONFIG_RPS
10655                sd->csd.func = rps_trigger_softirq;
10656                sd->csd.info = sd;
10657                sd->cpu = i;
10658#endif
10659
10660                init_gro_hash(&sd->backlog);
10661                sd->backlog.poll = process_backlog;
10662                sd->backlog.weight = weight_p;
10663        }
10664
10665        dev_boot_phase = 0;
10666
10667        /* The loopback device is special: if any other network device
10668         * is present in a network namespace, the loopback device must
10669         * be present too. Since we now dynamically allocate and free the
10670         * loopback device, ensure this invariant is maintained by
10671         * keeping the loopback device as the first device on the
10672         * list of network devices, ensuring that the loopback device
10673         * is the first device that appears and the last network device
10674         * that disappears.
10675         */
10676        if (register_pernet_device(&loopback_net_ops))
10677                goto out;
10678
10679        if (register_pernet_device(&default_device_ops))
10680                goto out;
10681
10682        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
10683        open_softirq(NET_RX_SOFTIRQ, net_rx_action);
10684
10685        rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
10686                                       NULL, dev_cpu_dead);
10687        WARN_ON(rc < 0);
10688        rc = 0;
10689out:
10690        return rc;
10691}
10692
10693subsys_initcall(net_dev_init);
10694