   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *      Linux NET3:     GRE over IP protocol decoder.
   4 *
   5 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   6 */
   7
   8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   9
  10#include <linux/capability.h>
  11#include <linux/module.h>
  12#include <linux/types.h>
  13#include <linux/kernel.h>
  14#include <linux/slab.h>
  15#include <linux/uaccess.h>
  16#include <linux/skbuff.h>
  17#include <linux/netdevice.h>
  18#include <linux/in.h>
  19#include <linux/tcp.h>
  20#include <linux/udp.h>
  21#include <linux/if_arp.h>
  22#include <linux/if_vlan.h>
  23#include <linux/init.h>
  24#include <linux/in6.h>
  25#include <linux/inetdevice.h>
  26#include <linux/igmp.h>
  27#include <linux/netfilter_ipv4.h>
  28#include <linux/etherdevice.h>
  29#include <linux/if_ether.h>
  30
  31#include <net/sock.h>
  32#include <net/ip.h>
  33#include <net/icmp.h>
  34#include <net/protocol.h>
  35#include <net/ip_tunnels.h>
  36#include <net/arp.h>
  37#include <net/checksum.h>
  38#include <net/dsfield.h>
  39#include <net/inet_ecn.h>
  40#include <net/xfrm.h>
  41#include <net/net_namespace.h>
  42#include <net/netns/generic.h>
  43#include <net/rtnetlink.h>
  44#include <net/gre.h>
  45#include <net/dst_metadata.h>
  46#include <net/erspan.h>
  47
  48/*
  49   Problems & solutions
  50   --------------------
  51
  52   1. The most important issue is detecting local dead loops.
  53   They would cause complete host lockup in transmit, which
  54   would be "resolved" by stack overflow or, if queueing is enabled,
  55   with infinite looping in net_bh.
  56
  57   We cannot track such dead loops during route installation,
  58   it is infeasible task. The most general solutions would be
  59   to keep skb->encapsulation counter (sort of local ttl),
  60   and silently drop packet when it expires. It is a good
  61   solution, but it supposes maintaining new variable in ALL
  62   skb, even if no tunneling is used.
  63
  64   Current solution: xmit_recursion breaks dead loops. This is a percpu
  65   counter, since when we enter the first ndo_xmit(), cpu migration is
  66   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
  67
  68   2. Networking dead loops would not kill routers, but would really
  69   kill network. IP hop limit plays role of "t->recursion" in this case,
  70   if we copy it from packet being encapsulated to upper header.
  71   It is very good solution, but it introduces two problems:
  72
  73   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  74     do not work over tunnels.
  75   - traceroute does not work. I planned to relay ICMP from tunnel,
  76     so that this problem would be solved and traceroute output
   77     would be even more informative. This idea appeared to be wrong:
  78     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  79     true router now :-)), all routers (at least, in neighbourhood of mine)
  80     return only 8 bytes of payload. It is the end.
  81
  82   Hence, if we want that OSPF worked or traceroute said something reasonable,
  83   we should search for another solution.
  84
  85   One of them is to parse packet trying to detect inner encapsulation
  86   made by our node. It is difficult or even impossible, especially,
   87   taking into account fragmentation. To be short, ttl is not a solution at all.
  88
  89   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  90   We force DF flag on tunnels with preconfigured hop limit,
  91   that is ALL. :-) Well, it does not remove the problem completely,
  92   but exponential growth of network traffic is changed to linear
  93   (branches, that exceed pmtu are pruned) and tunnel mtu
  94   rapidly degrades to value <68, where looping stops.
  95   Yes, it is not good if there exists a router in the loop,
  96   which does not force DF, even when encapsulating packets have DF set.
  97   But it is not our problem! Nobody could accuse us, we made
  98   all that we could make. Even if it is your gated who injected
  99   fatal route to network, even if it were you who configured
 100   fatal static route: you are innocent. :-)
 101
 102   Alexey Kuznetsov.
 103 */
 104
 105static bool log_ecn_error = true;
 106module_param(log_ecn_error, bool, 0644);
 107MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 108
 109static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 110static const struct header_ops ipgre_header_ops;
 111
 112static int ipgre_tunnel_init(struct net_device *dev);
 113static void erspan_build_header(struct sk_buff *skb,
 114                                u32 id, u32 index,
 115                                bool truncate, bool is_ipv4);
 116
 117static unsigned int ipgre_net_id __read_mostly;
 118static unsigned int gre_tap_net_id __read_mostly;
 119static unsigned int erspan_net_id __read_mostly;
 120
/* Shared ICMP error handler for GRE, GRETAP and ERSPAN tunnels: record
 * error state on the tunnel that emitted the packet quoted in the ICMP
 * error.  Returns 0 when the error was handled or must be ignored,
 * -ENOENT when no tunnel matches the quoted header.
 */
static int ipgre_err(struct sk_buff *skb, u32 info,
		     const struct tnl_ptk_info *tpi)
{

	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. It means, that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put GRE key to the third word
	   in GRE header. It makes impossible maintaining even soft
	   state for keyed GRE tunnels with enabled checksum. Tell
	   them "thank you".

	   Well, I wonder, rfc1812 was written by Cisco employee,
	   what the hell these idiots break standards established
	   by themselves???
	   */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	unsigned int data_len = 0;
	struct ip_tunnel *t;

	/* Select the per-netns tunnel table matching the inner payload
	 * type parsed from the quoted GRE header.
	 */
	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else if (tpi->proto == htons(ETH_P_ERSPAN) ||
		 tpi->proto == htons(ETH_P_ERSPAN2))
		itn = net_generic(net, erspan_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	/* The outer IP header of the offending packet is quoted right
	 * after the ICMP header.  saddr/daddr are swapped in the lookup
	 * because we want the tunnel that *sent* the quoted packet.
	 */
	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
			     iph->daddr, iph->saddr, tpi->key);

	if (!t)
		return -ENOENT;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return 0;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return 0;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return 0;
		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
		break;

	case ICMP_REDIRECT:
		break;
	}

#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 payload: try to relay the error toward the inner
	 * destination as an ICMPv6 unreachable; when that succeeds no
	 * local bookkeeping is needed.
	 */
	if (tpi->proto == htons(ETH_P_IPV6) &&
	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
					type, data_len))
		return 0;
#endif

	/* No error state for tunnels without a fixed unicast peer. */
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return 0;

	/* ttl == 0 means the outer TTL is inherited from the inner
	 * packet; a TTL expiry is then presumably legitimate (e.g.
	 * traceroute through the tunnel) rather than a tunnel fault.
	 */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return 0;

	/* Track bursts: count errors arriving within IPTUNNEL_ERR_TIMEO
	 * of the previous one, otherwise restart the count.
	 */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;

	return 0;
}
 213
/* ICMP error handler registered for IPPROTO_GRE.  On entry skb->data
 * points at the IP header quoted in the ICMP payload (positioned by the
 * caller).  PMTU and redirect errors update the routing layer directly;
 * everything else is handed to ipgre_err() for per-tunnel bookkeeping.
 */
static void gre_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means, that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put GRE key to the third word
	 * in GRE header. It makes impossible maintaining even soft
	 * state for keyed
	 * GRE tunnels with enabled checksum. Tell them "thank you".
	 *
	 * Well, I wonder, rfc1812 was written by Cisco employee,
	 * what the hell these idiots break standards established
	 * by themselves???
	 */

	const struct iphdr *iph = (struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;

	/* Parse the quoted GRE header; without its flags/key we cannot
	 * tell which tunnel the error refers to.
	 */
	if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
			     iph->ihl * 4) < 0)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, IPPROTO_GRE);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
			      IPPROTO_GRE);
		return;
	}

	ipgre_err(skb, info, &tpi);
}
 252
 253static bool is_erspan_type1(int gre_hdr_len)
 254{
 255        /* Both ERSPAN type I (version 0) and type II (version 1) use
 256         * protocol 0x88BE, but the type I has only 4-byte GRE header,
 257         * while type II has 8-byte.
 258         */
 259        return gre_hdr_len == 4;
 260}
 261
/* Receive path for ERSPAN-encapsulated packets (ETH_P_ERSPAN and
 * ETH_P_ERSPAN2).  Looks up the matching tunnel, strips the GRE and
 * ERSPAN headers and hands the inner frame to the generic tunnel
 * receive path.  Returns PACKET_RCVD when the skb was consumed,
 * PACKET_REJECT when the caller should generate an ICMP error.
 */
static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
		      int gre_hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct metadata_dst *tun_dst = NULL;
	struct erspan_base_hdr *ershdr;
	struct ip_tunnel_net *itn;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	struct erspan_md2 *md2;
	int ver;
	int len;

	itn = net_generic(net, erspan_net_id);
	iph = ip_hdr(skb);
	if (is_erspan_type1(gre_hdr_len)) {
		/* Type I carries no ERSPAN header at all, hence no
		 * session id that could serve as a key.
		 */
		ver = 0;
		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
					  tpi->flags | TUNNEL_NO_KEY,
					  iph->saddr, iph->daddr, 0);
	} else {
		/* Type II/III: the ERSPAN base header follows the GRE
		 * header; its version field selects the metadata layout.
		 */
		ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
		ver = ershdr->ver;
		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
					  tpi->flags | TUNNEL_KEY,
					  iph->saddr, iph->daddr, tpi->key);
	}

	if (tunnel) {
		if (is_erspan_type1(gre_hdr_len))
			len = gre_hdr_len;
		else
			len = gre_hdr_len + erspan_hdr_len(ver);

		if (unlikely(!pskb_may_pull(skb, len)))
			return PACKET_REJECT;

		if (__iptunnel_pull_header(skb,
					   len,
					   htons(ETH_P_TEB),
					   false, false) < 0)
			goto drop;

		if (tunnel->collect_md) {
			struct erspan_metadata *pkt_md, *md;
			struct ip_tunnel_info *info;
			unsigned char *gh;
			__be64 tun_id;
			__be16 flags;

			/* Collect-metadata mode always exposes a key. */
			tpi->flags |= TUNNEL_KEY;
			flags = tpi->flags;
			tun_id = key32_to_tunnel_id(tpi->key);

			tun_dst = ip_tun_rx_dst(skb, flags,
						tun_id, sizeof(*md));
			if (!tun_dst)
				return PACKET_REJECT;

			/* skb can be uncloned in __iptunnel_pull_header, so
			 * old pkt_md is no longer valid and we need to reset
			 * it
			 */
			gh = skb_network_header(skb) +
			     skb_network_header_len(skb);
			pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
							    sizeof(*ershdr));
			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
			md->version = ver;
			md2 = &md->u.md2;
			memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
						       ERSPAN_V2_MDSIZE);

			info = &tun_dst->u.tun_info;
			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
			info->options_len = sizeof(*md);
		}

		skb_reset_mac_header(skb);
		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_REJECT;

drop:
	/* Header pull failed: the skb is freed here, so report it as
	 * consumed to stop further processing (and any ICMP reply).
	 */
	kfree_skb(skb);
	return PACKET_RCVD;
}
 350
/* Common receive path for plain GRE and GRETAP tunnel tables.
 * @raw_proto is passed through to __iptunnel_pull_header() (set when
 * the caller retries ETH_P_TEB traffic against the plain GRE table).
 * Returns PACKET_RCVD when the skb was consumed (delivered or dropped),
 * PACKET_REJECT when metadata allocation failed, and PACKET_NEXT when
 * no tunnel matched so another table may be tried.
 */
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
	struct metadata_dst *tun_dst = NULL;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		const struct iphdr *tnl_params;

		if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
					   raw_proto, false) < 0)
			goto drop;

		/* Special case for ipgre_header_parse(), which expects the
		 * mac_header to point to the outer IP header.
		 */
		if (tunnel->dev->header_ops == &ipgre_header_ops)
			skb_pop_mac_header(skb);
		else
			skb_reset_mac_header(skb);

		/* Collect-metadata tunnels and tunnels without a fixed
		 * peer (daddr == 0) attach per-packet tunnel metadata.
		 */
		tnl_params = &tunnel->parms.iph;
		if (tunnel->collect_md || tnl_params->daddr == 0) {
			__be16 flags;
			__be64 tun_id;

			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
			tun_id = key32_to_tunnel_id(tpi->key);
			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
			if (!tun_dst)
				return PACKET_REJECT;
		}

		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_NEXT;

drop:
	/* skb freed here: report it as consumed. */
	kfree_skb(skb);
	return PACKET_RCVD;
}
 398
 399static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 400                     int hdr_len)
 401{
 402        struct net *net = dev_net(skb->dev);
 403        struct ip_tunnel_net *itn;
 404        int res;
 405
 406        if (tpi->proto == htons(ETH_P_TEB))
 407                itn = net_generic(net, gre_tap_net_id);
 408        else
 409                itn = net_generic(net, ipgre_net_id);
 410
 411        res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
 412        if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
 413                /* ipgre tunnels in collect metadata mode should receive
 414                 * also ETH_P_TEB traffic.
 415                 */
 416                itn = net_generic(net, ipgre_net_id);
 417                res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
 418        }
 419        return res;
 420}
 421
/* Receive entry point registered for IPPROTO_GRE.  Parses the GRE
 * header and dispatches to the ERSPAN or GRE/GRETAP receive path;
 * packets no tunnel claimed are answered with an ICMP port-unreachable
 * and dropped.
 */
static int gre_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
	}
#endif

	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
	if (hdr_len < 0)
		goto drop;

	/* Both ERSPAN ethertypes take the dedicated path; the version is
	 * distinguished inside erspan_rcv().
	 */
	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
		     tpi.proto == htons(ETH_P_ERSPAN2))) {
		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
			return 0;
		goto out;
	}

	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
		return 0;

out:
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}
 456
 457static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 458                       const struct iphdr *tnl_params,
 459                       __be16 proto)
 460{
 461        struct ip_tunnel *tunnel = netdev_priv(dev);
 462
 463        if (tunnel->parms.o_flags & TUNNEL_SEQ)
 464                tunnel->o_seqno++;
 465
 466        /* Push GRE header. */
 467        gre_build_header(skb, tunnel->tun_hlen,
 468                         tunnel->parms.o_flags, proto, tunnel->parms.o_key,
 469                         htonl(tunnel->o_seqno));
 470
 471        ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 472}
 473
 474static int gre_handle_offloads(struct sk_buff *skb, bool csum)
 475{
 476        if (csum && skb_checksum_start(skb) < skb->data)
 477                return -EINVAL;
 478        return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 479}
 480
/* Transmit on a collect-metadata (external mode) GRE/GRETAP device: the
 * GRE header is built from the per-skb tunnel info instead of the
 * netdevice configuration.
 */
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			__be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	int tunnel_hlen;
	__be16 flags;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	tunnel_hlen = gre_calc_hlen(key->tun_flags);

	if (skb_cow_head(skb, dev->needed_headroom))
		goto err_free_skb;

	/* Push Tunnel header. */
	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
		goto err_free_skb;

	/* Only CSUM/KEY/SEQ have GRE header representations; a requested
	 * sequence number comes from the device-global counter.
	 */
	flags = tun_info->key.tun_flags &
		(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
	gre_build_header(skb, tunnel_hlen, flags, proto,
			 tunnel_id_to_key32(tun_info->key.tun_id),
			 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);

	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);

	return;

err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}
 519
 520static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 521{
 522        struct ip_tunnel *tunnel = netdev_priv(dev);
 523        struct ip_tunnel_info *tun_info;
 524        const struct ip_tunnel_key *key;
 525        struct erspan_metadata *md;
 526        bool truncate = false;
 527        __be16 proto;
 528        int tunnel_hlen;
 529        int version;
 530        int nhoff;
 531        int thoff;
 532
 533        tun_info = skb_tunnel_info(skb);
 534        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
 535                     ip_tunnel_info_af(tun_info) != AF_INET))
 536                goto err_free_skb;
 537
 538        key = &tun_info->key;
 539        if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
 540                goto err_free_skb;
 541        if (tun_info->options_len < sizeof(*md))
 542                goto err_free_skb;
 543        md = ip_tunnel_info_opts(tun_info);
 544
 545        /* ERSPAN has fixed 8 byte GRE header */
 546        version = md->version;
 547        tunnel_hlen = 8 + erspan_hdr_len(version);
 548
 549        if (skb_cow_head(skb, dev->needed_headroom))
 550                goto err_free_skb;
 551
 552        if (gre_handle_offloads(skb, false))
 553                goto err_free_skb;
 554
 555        if (skb->len > dev->mtu + dev->hard_header_len) {
 556                pskb_trim(skb, dev->mtu + dev->hard_header_len);
 557                truncate = true;
 558        }
 559
 560        nhoff = skb_network_header(skb) - skb_mac_header(skb);
 561        if (skb->protocol == htons(ETH_P_IP) &&
 562            (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
 563                truncate = true;
 564
 565        thoff = skb_transport_header(skb) - skb_mac_header(skb);
 566        if (skb->protocol == htons(ETH_P_IPV6) &&
 567            (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
 568                truncate = true;
 569
 570        if (version == 1) {
 571                erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
 572                                    ntohl(md->u.index), truncate, true);
 573                proto = htons(ETH_P_ERSPAN);
 574        } else if (version == 2) {
 575                erspan_build_header_v2(skb,
 576                                       ntohl(tunnel_id_to_key32(key->tun_id)),
 577                                       md->u.md2.dir,
 578                                       get_hwid(&md->u.md2),
 579                                       truncate, true);
 580                proto = htons(ETH_P_ERSPAN2);
 581        } else {
 582                goto err_free_skb;
 583        }
 584
 585        gre_build_header(skb, 8, TUNNEL_SEQ,
 586                         proto, 0, htonl(tunnel->o_seqno++));
 587
 588        ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
 589
 590        return;
 591
 592err_free_skb:
 593        kfree_skb(skb);
 594        dev->stats.tx_dropped++;
 595}
 596
/* ndo_fill_metadata_dst: route a collect-metadata skb by its tunnel key
 * and write the resolved outer source address back into the metadata.
 */
static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	const struct ip_tunnel_key *key;
	struct rtable *rt;
	struct flowi4 fl4;

	if (ip_tunnel_info_af(info) != AF_INET)
		return -EINVAL;

	key = &info->key;
	ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), key->tos, 0,
			    skb->mark, skb_get_hash(skb));
	rt = ip_route_output_key(dev_net(dev), &fl4);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* Only the source address chosen by the lookup is needed, not
	 * the route itself.
	 */
	ip_rt_put(rt);
	info->key.u.ipv4.src = fl4.saddr;
	return 0;
}
 619
 620static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 621                              struct net_device *dev)
 622{
 623        struct ip_tunnel *tunnel = netdev_priv(dev);
 624        const struct iphdr *tnl_params;
 625
 626        if (!pskb_inet_may_pull(skb))
 627                goto free_skb;
 628
 629        if (tunnel->collect_md) {
 630                gre_fb_xmit(skb, dev, skb->protocol);
 631                return NETDEV_TX_OK;
 632        }
 633
 634        if (dev->header_ops) {
 635                if (skb_cow_head(skb, 0))
 636                        goto free_skb;
 637
 638                tnl_params = (const struct iphdr *)skb->data;
 639
 640                /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
 641                 * to gre header.
 642                 */
 643                skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
 644                skb_reset_mac_header(skb);
 645        } else {
 646                if (skb_cow_head(skb, dev->needed_headroom))
 647                        goto free_skb;
 648
 649                tnl_params = &tunnel->parms.iph;
 650        }
 651
 652        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
 653                goto free_skb;
 654
 655        __gre_xmit(skb, dev, tnl_params, skb->protocol);
 656        return NETDEV_TX_OK;
 657
 658free_skb:
 659        kfree_skb(skb);
 660        dev->stats.tx_dropped++;
 661        return NETDEV_TX_OK;
 662}
 663
 664static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 665                               struct net_device *dev)
 666{
 667        struct ip_tunnel *tunnel = netdev_priv(dev);
 668        bool truncate = false;
 669        __be16 proto;
 670
 671        if (!pskb_inet_may_pull(skb))
 672                goto free_skb;
 673
 674        if (tunnel->collect_md) {
 675                erspan_fb_xmit(skb, dev);
 676                return NETDEV_TX_OK;
 677        }
 678
 679        if (gre_handle_offloads(skb, false))
 680                goto free_skb;
 681
 682        if (skb_cow_head(skb, dev->needed_headroom))
 683                goto free_skb;
 684
 685        if (skb->len > dev->mtu + dev->hard_header_len) {
 686                pskb_trim(skb, dev->mtu + dev->hard_header_len);
 687                truncate = true;
 688        }
 689
 690        /* Push ERSPAN header */
 691        if (tunnel->erspan_ver == 0) {
 692                proto = htons(ETH_P_ERSPAN);
 693                tunnel->parms.o_flags &= ~TUNNEL_SEQ;
 694        } else if (tunnel->erspan_ver == 1) {
 695                erspan_build_header(skb, ntohl(tunnel->parms.o_key),
 696                                    tunnel->index,
 697                                    truncate, true);
 698                proto = htons(ETH_P_ERSPAN);
 699        } else if (tunnel->erspan_ver == 2) {
 700                erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
 701                                       tunnel->dir, tunnel->hwid,
 702                                       truncate, true);
 703                proto = htons(ETH_P_ERSPAN2);
 704        } else {
 705                goto free_skb;
 706        }
 707
 708        tunnel->parms.o_flags &= ~TUNNEL_KEY;
 709        __gre_xmit(skb, dev, &tunnel->parms.iph, proto);
 710        return NETDEV_TX_OK;
 711
 712free_skb:
 713        kfree_skb(skb);
 714        dev->stats.tx_dropped++;
 715        return NETDEV_TX_OK;
 716}
 717
 718static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 719                                struct net_device *dev)
 720{
 721        struct ip_tunnel *tunnel = netdev_priv(dev);
 722
 723        if (!pskb_inet_may_pull(skb))
 724                goto free_skb;
 725
 726        if (tunnel->collect_md) {
 727                gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
 728                return NETDEV_TX_OK;
 729        }
 730
 731        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
 732                goto free_skb;
 733
 734        if (skb_cow_head(skb, dev->needed_headroom))
 735                goto free_skb;
 736
 737        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
 738        return NETDEV_TX_OK;
 739
 740free_skb:
 741        kfree_skb(skb);
 742        dev->stats.tx_dropped++;
 743        return NETDEV_TX_OK;
 744}
 745
/* Recompute header lengths after the tunnel's output flags changed and
 * propagate the delta to headroom, MTU and offload features.
 */
static void ipgre_link_update(struct net_device *dev, bool set_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int len;

	/* After these statements, len holds the change in GRE header
	 * size (new - old).
	 */
	len = tunnel->tun_hlen;
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	len = tunnel->tun_hlen - len;
	tunnel->hlen = tunnel->hlen + len;

	if (dev->header_ops)
		dev->hard_header_len += len;
	else
		dev->needed_headroom += len;

	if (set_mtu)
		dev->mtu = max_t(int, dev->mtu - len, 68); /* 68: historic IPv4 minimum MTU */

	/* Software GSO is only offered when sequence numbers are off —
	 * presumably because segmentation cannot produce per-segment
	 * sequence numbers — and, with TUNNEL_CSUM, only when there is
	 * no extra encapsulation.
	 */
	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    tunnel->encap.type == TUNNEL_ENCAP_NONE) {
			dev->features |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		} else {
			dev->features &= ~NETIF_F_GSO_SOFTWARE;
			dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
		}
		dev->features |= NETIF_F_LLTX;
	} else {
		dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
		dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
	}
}
 779
/* ioctl backend (SIOCADDTUNNEL/SIOCCHGTUNNEL/...) for GRE devices.
 * Validates user-supplied parameters, converts between the on-the-wire
 * GRE flag encoding and the internal TUNNEL_* flags around the generic
 * ip_tunnel_ctl() call, and resyncs derived state after a change.
 */
static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p,
			    int cmd)
{
	int err;

	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		/* Reject anything that is not plain IPv4+GRE without IP
		 * options, and GRE features (version, routing) that are
		 * not implemented here.
		 */
		if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
		    p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
		    ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING)))
			return -EINVAL;
	}

	/* Wire encoding -> internal TUNNEL_* flags. */
	p->i_flags = gre_flags_to_tnl_flags(p->i_flags);
	p->o_flags = gre_flags_to_tnl_flags(p->o_flags);

	err = ip_tunnel_ctl(dev, p, cmd);
	if (err)
		return err;

	if (cmd == SIOCCHGTUNNEL) {
		struct ip_tunnel *t = netdev_priv(dev);

		t->parms.i_flags = p->i_flags;
		t->parms.o_flags = p->o_flags;

		/* Only non-erspan devices recompute header length here. */
		if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
			ipgre_link_update(dev, true);
	}

	/* Convert back so the structure returned to user space carries
	 * on-the-wire GRE flags again.
	 */
	p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
	p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
	return 0;
}
 813
 814/* Nice toy. Unfortunately, useless in real life :-)
 815   It allows to construct virtual multiprotocol broadcast "LAN"
 816   over the Internet, provided multicast routing is tuned.
 817
 818
 819   I have no idea was this bicycle invented before me,
 820   so that I had to set ARPHRD_IPGRE to a random value.
 821   I have an impression, that Cisco could make something similar,
 822   but this feature is apparently missing in IOS<=11.2(8).
 823
 824   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
 825   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
 826
 827   ping -t 255 224.66.66.66
 828
 829   If nobody answers, mbone does not work.
 830
 831   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
 832   ip addr add 10.66.66.<somewhat>/24 dev Universe
 833   ifconfig Universe up
 834   ifconfig Universe add fe80::<Your_real_addr>/10
 835   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
 836   ftp 10.66.66.66
 837   ...
 838   ftp fec0:6666:6666::193.233.7.65
 839   ...
 840 */
 841static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
 842                        unsigned short type,
 843                        const void *daddr, const void *saddr, unsigned int len)
 844{
 845        struct ip_tunnel *t = netdev_priv(dev);
 846        struct iphdr *iph;
 847        struct gre_base_hdr *greh;
 848
 849        iph = skb_push(skb, t->hlen + sizeof(*iph));
 850        greh = (struct gre_base_hdr *)(iph+1);
 851        greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
 852        greh->protocol = htons(type);
 853
 854        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
 855
 856        /* Set the source hardware address. */
 857        if (saddr)
 858                memcpy(&iph->saddr, saddr, 4);
 859        if (daddr)
 860                memcpy(&iph->daddr, daddr, 4);
 861        if (iph->daddr)
 862                return t->hlen + sizeof(*iph);
 863
 864        return -(t->hlen + sizeof(*iph));
 865}
 866
 867static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
 868{
 869        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
 870        memcpy(haddr, &iph->saddr, 4);
 871        return 4;
 872}
 873
/* Link-layer header operations for broadcast/NBMA GRE devices. */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
 878
 879#ifdef CONFIG_NET_IPGRE_BROADCAST
 880static int ipgre_open(struct net_device *dev)
 881{
 882        struct ip_tunnel *t = netdev_priv(dev);
 883
 884        if (ipv4_is_multicast(t->parms.iph.daddr)) {
 885                struct flowi4 fl4;
 886                struct rtable *rt;
 887
 888                rt = ip_route_output_gre(t->net, &fl4,
 889                                         t->parms.iph.daddr,
 890                                         t->parms.iph.saddr,
 891                                         t->parms.o_key,
 892                                         RT_TOS(t->parms.iph.tos),
 893                                         t->parms.link);
 894                if (IS_ERR(rt))
 895                        return -EADDRNOTAVAIL;
 896                dev = rt->dst.dev;
 897                ip_rt_put(rt);
 898                if (!__in_dev_get_rtnl(dev))
 899                        return -EADDRNOTAVAIL;
 900                t->mlink = dev->ifindex;
 901                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
 902        }
 903        return 0;
 904}
 905
 906static int ipgre_close(struct net_device *dev)
 907{
 908        struct ip_tunnel *t = netdev_priv(dev);
 909
 910        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
 911                struct in_device *in_dev;
 912                in_dev = inetdev_by_index(t->net, t->mlink);
 913                if (in_dev)
 914                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
 915        }
 916        return 0;
 917}
 918#endif
 919
/* netdev callbacks for classic (layer-3) GRE devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ip_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= dev_get_tstats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_tunnel_ctl		= ipgre_tunnel_ctl,
};
 934
/* Offload features common to all GRE device flavours. */
#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)
 939
/* rtnl setup hook for "gre" devices. */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->type		= ARPHRD_IPGRE;
	ip_tunnel_setup(dev, ipgre_net_id);
}
 946
/* Common init for gre and gretap devices: compute header lengths from
 * the output flags and advertise offload features accordingly.
 */
static void __gre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	tunnel->parms.iph.protocol = IPPROTO_GRE;

	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
	dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		/* TCP offload with GRE SEQ is not supported, nor
		 * can we support 2 levels of outer headers requiring
		 * an update.
		 */
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
			dev->features    |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		}

		/* Can use a lockless transmit, unless we generate
		 * output sequences
		 */
		dev->features |= NETIF_F_LLTX;
	}
}
 978
/* ndo_init for layer-3 GRE devices: finish common GRE setup and
 * configure the link-layer presentation (4-byte IPv4 "hardware"
 * addresses; header_ops for broadcast or fixed-remote-less tunnels).
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__gre_tunnel_init(dev);

	/* Device addresses mirror the tunnel endpoints. */
	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	netif_keep_dst(dev);
	dev->addr_len		= 4;

	if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Broadcast GRE requires a local address. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
			dev->hard_header_len = tunnel->hlen + sizeof(*iph);
			dev->needed_headroom = 0;
		}
#endif
	} else if (!tunnel->collect_md) {
		/* No fixed remote: build headers via header_ops so the
		 * destination can be supplied per packet.
		 */
		dev->header_ops = &ipgre_header_ops;
		dev->hard_header_len = tunnel->hlen + sizeof(*iph);
		dev->needed_headroom = 0;
	}

	return ip_tunnel_init(dev);
}
1012
/* Receive hooks registered with the GRE demux (GREPROTO_CISCO). */
static const struct gre_protocol ipgre_protocol = {
	.handler     = gre_rcv,
	.err_handler = gre_err,
};
1017
/* Per-netns init for the "gre" link type (no fallback device name). */
static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}
1022
/* Batched per-netns teardown for "gre" devices. */
static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
}
1027
/* pernet registration for the "gre" link type. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit_batch = ipgre_exit_batch_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
1034
1035static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
1036                                 struct netlink_ext_ack *extack)
1037{
1038        __be16 flags;
1039
1040        if (!data)
1041                return 0;
1042
1043        flags = 0;
1044        if (data[IFLA_GRE_IFLAGS])
1045                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1046        if (data[IFLA_GRE_OFLAGS])
1047                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1048        if (flags & (GRE_VERSION|GRE_ROUTING))
1049                return -EINVAL;
1050
1051        if (data[IFLA_GRE_COLLECT_METADATA] &&
1052            data[IFLA_GRE_ENCAP_TYPE] &&
1053            nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
1054                return -EINVAL;
1055
1056        return 0;
1057}
1058
1059static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
1060                              struct netlink_ext_ack *extack)
1061{
1062        __be32 daddr;
1063
1064        if (tb[IFLA_ADDRESS]) {
1065                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1066                        return -EINVAL;
1067                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1068                        return -EADDRNOTAVAIL;
1069        }
1070
1071        if (!data)
1072                goto out;
1073
1074        if (data[IFLA_GRE_REMOTE]) {
1075                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1076                if (!daddr)
1077                        return -EINVAL;
1078        }
1079
1080out:
1081        return ipgre_tunnel_validate(tb, data, extack);
1082}
1083
/* Netlink validation for erspan devices.  On top of the gretap checks,
 * constrain the GRE flags and the session ID range for v1/v2.
 */
static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
			   struct netlink_ext_ack *extack)
{
	__be16 flags = 0;
	int ret;

	if (!data)
		return 0;

	ret = ipgre_tap_validate(tb, data, extack);
	if (ret)
		return ret;

	/* ERSPAN version 0 needs none of the checks below. */
	if (data[IFLA_GRE_ERSPAN_VER] &&
	    nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0)
		return 0;

	/* ERSPAN type II/III should only have GRE sequence and key flag */
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (!data[IFLA_GRE_COLLECT_METADATA] &&
	    flags != (GRE_SEQ | GRE_KEY))
		return -EINVAL;

	/* ERSPAN Session ID only has 10-bit. Since we reuse
	 * 32-bit key field as ID, check it's range.
	 */
	if (data[IFLA_GRE_IKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
		return -EINVAL;

	if (data[IFLA_GRE_OKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
		return -EINVAL;

	return 0;
}
1123
/* Translate netlink attributes into ip_tunnel_parm for gre/gretap.
 * Starts from zeroed parms; attributes not present keep their zero
 * default.  Also updates collect_md/ignore_df state on the tunnel.
 */
static int ipgre_netlink_parms(struct net_device *dev,
				struct nlattr *data[],
				struct nlattr *tb[],
				struct ip_tunnel_parm *parms,
				__u32 *fwmark)
{
	struct ip_tunnel *t = netdev_priv(dev);

	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return 0;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery defaults to on; it conflicts with ignore_df. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
		if (t->ignore_df)
			return -EINVAL;
		parms->iph.frag_off = htons(IP_DF);
	}

	if (data[IFLA_GRE_COLLECT_METADATA]) {
		t->collect_md = true;
		/* External (flow-based) devices are not real IPGRE links. */
		if (dev->type == ARPHRD_IPGRE)
			dev->type = ARPHRD_NONE;
	}

	if (data[IFLA_GRE_IGNORE_DF]) {
		/* ignore_df cannot be combined with a set DF bit. */
		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
		  && (parms->iph.frag_off & htons(IP_DF)))
			return -EINVAL;
		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
	}

	if (data[IFLA_GRE_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

	return 0;
}
1190
/* erspan-specific netlink parsing: common GRE attributes first, then
 * the ERSPAN version and its per-version fields (index for v1,
 * direction/hwid for v2), each range-checked against its bit mask.
 */
static int erspan_netlink_parms(struct net_device *dev,
				struct nlattr *data[],
				struct nlattr *tb[],
				struct ip_tunnel_parm *parms,
				__u32 *fwmark)
{
	struct ip_tunnel *t = netdev_priv(dev);
	int err;

	err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
	if (err)
		return err;
	if (!data)
		return 0;

	if (data[IFLA_GRE_ERSPAN_VER]) {
		t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);

		/* Only versions 0, 1 and 2 exist. */
		if (t->erspan_ver > 2)
			return -EINVAL;
	}

	if (t->erspan_ver == 1) {
		if (data[IFLA_GRE_ERSPAN_INDEX]) {
			t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
			if (t->index & ~INDEX_MASK)
				return -EINVAL;
		}
	} else if (t->erspan_ver == 2) {
		if (data[IFLA_GRE_ERSPAN_DIR]) {
			t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
			if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
				return -EINVAL;
		}
		if (data[IFLA_GRE_ERSPAN_HWID]) {
			t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
			if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
				return -EINVAL;
		}
	}

	return 0;
}
1234
1235/* This function returns true when ENCAP attributes are present in the nl msg */
1236static bool ipgre_netlink_encap_parms(struct nlattr *data[],
1237                                      struct ip_tunnel_encap *ipencap)
1238{
1239        bool ret = false;
1240
1241        memset(ipencap, 0, sizeof(*ipencap));
1242
1243        if (!data)
1244                return ret;
1245
1246        if (data[IFLA_GRE_ENCAP_TYPE]) {
1247                ret = true;
1248                ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
1249        }
1250
1251        if (data[IFLA_GRE_ENCAP_FLAGS]) {
1252                ret = true;
1253                ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
1254        }
1255
1256        if (data[IFLA_GRE_ENCAP_SPORT]) {
1257                ret = true;
1258                ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
1259        }
1260
1261        if (data[IFLA_GRE_ENCAP_DPORT]) {
1262                ret = true;
1263                ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
1264        }
1265
1266        return ret;
1267}
1268
/* ndo_init for gretap devices: common GRE setup plus Ethernet extras. */
static int gre_tap_init(struct net_device *dev)
{
	__gre_tunnel_init(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}
1277
/* netdev callbacks for gretap (Ethernet-over-GRE) devices. */
static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= dev_get_tstats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};
1289
1290static int erspan_tunnel_init(struct net_device *dev)
1291{
1292        struct ip_tunnel *tunnel = netdev_priv(dev);
1293
1294        if (tunnel->erspan_ver == 0)
1295                tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */
1296        else
1297                tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */
1298
1299        tunnel->parms.iph.protocol = IPPROTO_GRE;
1300        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
1301                       erspan_hdr_len(tunnel->erspan_ver);
1302
1303        dev->features           |= GRE_FEATURES;
1304        dev->hw_features        |= GRE_FEATURES;
1305        dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
1306        netif_keep_dst(dev);
1307
1308        return ip_tunnel_init(dev);
1309}
1310
/* netdev callbacks for erspan devices. */
static const struct net_device_ops erspan_netdev_ops = {
	.ndo_init		= erspan_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= erspan_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= dev_get_tstats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};
1322
/* rtnl setup hook for "gretap" devices (Ethernet-style). */
static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);
	/* max_mtu is established later by the generic tunnel code. */
	dev->max_mtu = 0;
	dev->netdev_ops	= &gre_tap_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, gre_tap_net_id);
}
1332
1333static int
1334ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
1335{
1336        struct ip_tunnel_encap ipencap;
1337
1338        if (ipgre_netlink_encap_parms(data, &ipencap)) {
1339                struct ip_tunnel *t = netdev_priv(dev);
1340                int err = ip_tunnel_encap_setup(t, &ipencap);
1341
1342                if (err < 0)
1343                        return err;
1344        }
1345
1346        return 0;
1347}
1348
/* rtnl newlink handler for gre/gretap devices. */
static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm p;
	__u32 fwmark = 0;
	int err;

	/* Encapsulation must be configured before the header lengths
	 * derived from it are computed in ipgre_netlink_parms().
	 */
	err = ipgre_newlink_encap_setup(dev, data);
	if (err)
		return err;

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;
	return ip_tunnel_newlink(dev, tb, &p, fwmark);
}
1366
/* rtnl newlink handler for erspan devices. */
static int erspan_newlink(struct net *src_net, struct net_device *dev,
			  struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm p;
	__u32 fwmark = 0;
	int err;

	err = ipgre_newlink_encap_setup(dev, data);
	if (err)
		return err;

	err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err)
		return err;
	return ip_tunnel_newlink(dev, tb, &p, fwmark);
}
1384
/* rtnl changelink handler for gre/gretap: reparse attributes, apply
 * them through the generic tunnel code, then refresh the cached flags
 * and recompute header length/MTU.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	__u32 fwmark = t->fwmark;
	struct ip_tunnel_parm p;
	int err;

	err = ipgre_newlink_encap_setup(dev, data);
	if (err)
		return err;

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;

	err = ip_tunnel_changelink(dev, tb, &p, fwmark);
	if (err < 0)
		return err;

	t->parms.i_flags = p.i_flags;
	t->parms.o_flags = p.o_flags;

	/* Only touch the MTU if the user did not set one explicitly. */
	ipgre_link_update(dev, !tb[IFLA_MTU]);

	return 0;
}
1413
/* rtnl changelink handler for erspan devices; unlike gre/gretap no
 * header-length update is performed here.
 */
static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
			     struct nlattr *data[],
			     struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	__u32 fwmark = t->fwmark;
	struct ip_tunnel_parm p;
	int err;

	err = ipgre_newlink_encap_setup(dev, data);
	if (err)
		return err;

	err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;

	err = ip_tunnel_changelink(dev, tb, &p, fwmark);
	if (err < 0)
		return err;

	t->parms.i_flags = p.i_flags;
	t->parms.o_flags = p.o_flags;

	return 0;
}
1440
/* Worst-case netlink message size for ipgre_fill_info(); one
 * nla_total_size() entry per attribute that may be emitted.
 */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		/* IFLA_GRE_ENCAP_TYPE */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_FLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_SPORT */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_DPORT */
		nla_total_size(2) +
		/* IFLA_GRE_COLLECT_METADATA */
		nla_total_size(0) +
		/* IFLA_GRE_IGNORE_DF */
		nla_total_size(1) +
		/* IFLA_GRE_FWMARK */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_INDEX */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_VER */
		nla_total_size(1) +
		/* IFLA_GRE_ERSPAN_DIR */
		nla_total_size(1) +
		/* IFLA_GRE_ERSPAN_HWID */
		nla_total_size(2) +
		0;
}
1488
/* Dump tunnel configuration to a netlink message (rtnl fill_info).
 * Used by gre, gretap and erspan; the erspan attributes are emitted
 * only when erspan_ver is in range.  Returns 0 or -EMSGSIZE.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;
	__be16 o_flags = p->o_flags;

	if (t->erspan_ver <= 2) {
		/* Non-collect_md erspan v1/v2 always transmits a key,
		 * so report it even if not explicitly configured.
		 */
		if (t->erspan_ver != 0 && !t->collect_md)
			o_flags |= TUNNEL_KEY;

		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
			goto nla_put_failure;

		if (t->erspan_ver == 1) {
			if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
				goto nla_put_failure;
		} else if (t->erspan_ver == 2) {
			if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
				goto nla_put_failure;
			if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
				goto nla_put_failure;
		}
	}

	/* Flags go out in GRE wire format, not internal TUNNEL_* bits. */
	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
			 gre_tnl_flags_to_gre_flags(o_flags)) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))) ||
	    nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
		goto nla_put_failure;

	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
			t->encap.type) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
			 t->encap.sport) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
			 t->encap.dport) ||
	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
			t->encap.flags))
		goto nla_put_failure;

	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
		goto nla_put_failure;

	if (t->collect_md) {
		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1552
/* rtnl setup hook for "erspan" devices; defaults to ERSPAN version 1. */
static void erspan_setup(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	ether_setup(dev);
	/* max_mtu is established later by the generic tunnel code. */
	dev->max_mtu = 0;
	dev->netdev_ops = &erspan_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, erspan_net_id);
	t->erspan_ver = 1;
}
1565
/* Netlink attribute policy shared by the gre, gretap and erspan
 * link types.
 */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = sizeof_field(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
};
1589
/* rtnetlink registration for the layer-3 "gre" link type. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};
1604
/* rtnetlink registration for the Ethernet "gretap" link type. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};
1619
/* rtnetlink registration for the "erspan" link type. */
static struct rtnl_link_ops erspan_link_ops __read_mostly = {
	.kind		= "erspan",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= erspan_setup,
	.validate	= erspan_validate,
	.newlink	= erspan_newlink,
	.changelink	= erspan_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};
1634
/* Create a flow-based (collect_md) gretap device programmatically, on
 * behalf of callers such as openvswitch.  Returns the new device or an
 * ERR_PTR; on partial failure the device is unregistered again.
 */
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	LIST_HEAD(list_kill);
	struct ip_tunnel *t;
	int err;

	/* No netlink attributes: everything stays at defaults. */
	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &ipgre_tap_ops, tb, NULL);
	if (IS_ERR(dev))
		return dev;

	/* Configure flow based GRE device. */
	t = netdev_priv(dev);
	t->collect_md = true;

	err = ipgre_newlink(net, dev, tb, NULL, NULL);
	if (err < 0) {
		/* Not yet registered: a plain free is sufficient. */
		free_netdev(dev);
		return ERR_PTR(err);
	}

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
	if (err)
		goto out;

	err = rtnl_configure_link(dev, NULL);
	if (err < 0)
		goto out;

	return dev;
out:
	/* Registered by now: tear down via the regular dellink path. */
	ip_tunnel_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
1679
/* Per-netns init for the gretap family: set up the tunnel net and its
 * fallback device "gretap0".
 */
static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}
1684
/* Batched per-netns teardown of all gretap tunnels. */
static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
}
1689
/* pernet glue for gretap; .size reserves a struct ip_tunnel_net per
 * namespace, keyed by gre_tap_net_id.
 */
static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit_batch = ipgre_tap_exit_batch_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
1696
/* Per-netns init for the erspan family: set up the tunnel net and its
 * fallback device "erspan0".
 */
static int __net_init erspan_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, erspan_net_id,
				  &erspan_link_ops, "erspan0");
}
1702
/* Batched per-netns teardown of all erspan tunnels. */
static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
{
	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
}
1707
/* pernet glue for erspan; .size reserves a struct ip_tunnel_net per
 * namespace, keyed by erspan_net_id.
 */
static struct pernet_operations erspan_net_ops = {
	.init = erspan_init_net,
	.exit_batch = erspan_exit_batch_net,
	.id   = &erspan_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
1714
/* Module init.  Registration order matters: the per-netns state for all
 * three families (gre, gretap, erspan) is set up first, then the GRE
 * protocol demux is hooked, and only then are the rtnl link ops exposed
 * so that links cannot be created before their backing state exists.
 * Each failure label unwinds exactly the steps that succeeded, in
 * reverse order.
 */
static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = register_pernet_device(&ipgre_tap_net_ops);
	if (err < 0)
		goto pnet_tap_failed;

	err = register_pernet_device(&erspan_net_ops);
	if (err < 0)
		goto pnet_erspan_failed;

	/* Start receiving GRE packets (version 0, Cisco-style). */
	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	err = rtnl_link_register(&erspan_link_ops);
	if (err < 0)
		goto erspan_link_failed;

	return 0;

erspan_link_failed:
	rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
	unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
	unregister_pernet_device(&ipgre_net_ops);
	return err;
}
1767
/* Module exit.  Tear down the netlink-visible link ops and the GRE
 * protocol hook before releasing the per-netns tunnel state, so no new
 * links or incoming packets reference state that is being freed.
 */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	rtnl_link_unregister(&erspan_link_ops);
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
	unregister_pernet_device(&erspan_net_ops);
}
1778
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Autoload the module when a link of one of these kinds is requested
 * via rtnetlink, or when one of the fallback device names is opened.
 */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");
1788