linux/net/ipv4/ip_gre.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *      Linux NET3:     GRE over IP protocol decoder.
   4 *
   5 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   6 */
   7
   8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   9
  10#include <linux/capability.h>
  11#include <linux/module.h>
  12#include <linux/types.h>
  13#include <linux/kernel.h>
  14#include <linux/slab.h>
  15#include <linux/uaccess.h>
  16#include <linux/skbuff.h>
  17#include <linux/netdevice.h>
  18#include <linux/in.h>
  19#include <linux/tcp.h>
  20#include <linux/udp.h>
  21#include <linux/if_arp.h>
  22#include <linux/if_vlan.h>
  23#include <linux/init.h>
  24#include <linux/in6.h>
  25#include <linux/inetdevice.h>
  26#include <linux/igmp.h>
  27#include <linux/netfilter_ipv4.h>
  28#include <linux/etherdevice.h>
  29#include <linux/if_ether.h>
  30
  31#include <net/sock.h>
  32#include <net/ip.h>
  33#include <net/icmp.h>
  34#include <net/protocol.h>
  35#include <net/ip_tunnels.h>
  36#include <net/arp.h>
  37#include <net/checksum.h>
  38#include <net/dsfield.h>
  39#include <net/inet_ecn.h>
  40#include <net/xfrm.h>
  41#include <net/net_namespace.h>
  42#include <net/netns/generic.h>
  43#include <net/rtnetlink.h>
  44#include <net/gre.h>
  45#include <net/dst_metadata.h>
  46#include <net/erspan.h>
  47
  48/*
  49   Problems & solutions
  50   --------------------
  51
  52   1. The most important issue is detecting local dead loops.
  53   They would cause complete host lockup in transmit, which
  54   would be "resolved" by stack overflow or, if queueing is enabled,
  55   with infinite looping in net_bh.
  56
  57   We cannot track such dead loops during route installation,
  58   it is infeasible task. The most general solutions would be
  59   to keep skb->encapsulation counter (sort of local ttl),
  60   and silently drop packet when it expires. It is a good
  61   solution, but it supposes maintaining new variable in ALL
  62   skb, even if no tunneling is used.
  63
  64   Current solution: xmit_recursion breaks dead loops. This is a percpu
  65   counter, since when we enter the first ndo_xmit(), cpu migration is
  66   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
  67
  68   2. Networking dead loops would not kill routers, but would really
  69   kill network. IP hop limit plays role of "t->recursion" in this case,
  70   if we copy it from packet being encapsulated to upper header.
  71   It is very good solution, but it introduces two problems:
  72
  73   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  74     do not work over tunnels.
  75   - traceroute does not work. I planned to relay ICMP from tunnel,
  76     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
  78     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  79     true router now :-)), all routers (at least, in neighbourhood of mine)
  80     return only 8 bytes of payload. It is the end.
  81
  82   Hence, if we want that OSPF worked or traceroute said something reasonable,
  83   we should search for another solution.
  84
  85   One of them is to parse packet trying to detect inner encapsulation
  86   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.
  88
  89   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  90   We force DF flag on tunnels with preconfigured hop limit,
  91   that is ALL. :-) Well, it does not remove the problem completely,
  92   but exponential growth of network traffic is changed to linear
  93   (branches, that exceed pmtu are pruned) and tunnel mtu
  94   rapidly degrades to value <68, where looping stops.
  95   Yes, it is not good if there exists a router in the loop,
  96   which does not force DF, even when encapsulating packets have DF set.
  97   But it is not our problem! Nobody could accuse us, we made
  98   all that we could make. Even if it is your gated who injected
  99   fatal route to network, even if it were you who configured
 100   fatal static route: you are innocent. :-)
 101
 102   Alexey Kuznetsov.
 103 */
 104
/* Module parameter (mode 0644). Passed to ip_tunnel_rcv() on every receive
 * path in this file; per its description it controls logging of packets
 * received with corrupted ECN.
 */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

/* Forward declarations for symbols used before their definition. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
				u32 id, u32 index,
				bool truncate, bool is_ipv4);

/* Ids handed to net_generic() to fetch the per-namespace tunnel table of
 * each device flavour: plain GRE, GRE tap (ETH_P_TEB) and ERSPAN.
 */
static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;
 118
 119static int ipgre_err(struct sk_buff *skb, u32 info,
 120                     const struct tnl_ptk_info *tpi)
 121{
 122
 123        /* All the routers (except for Linux) return only
 124           8 bytes of packet payload. It means, that precise relaying of
 125           ICMP in the real Internet is absolutely infeasible.
 126
 127           Moreover, Cisco "wise men" put GRE key to the third word
 128           in GRE header. It makes impossible maintaining even soft
 129           state for keyed GRE tunnels with enabled checksum. Tell
 130           them "thank you".
 131
 132           Well, I wonder, rfc1812 was written by Cisco employee,
 133           what the hell these idiots break standards established
 134           by themselves???
 135           */
 136        struct net *net = dev_net(skb->dev);
 137        struct ip_tunnel_net *itn;
 138        const struct iphdr *iph;
 139        const int type = icmp_hdr(skb)->type;
 140        const int code = icmp_hdr(skb)->code;
 141        unsigned int data_len = 0;
 142        struct ip_tunnel *t;
 143
 144        if (tpi->proto == htons(ETH_P_TEB))
 145                itn = net_generic(net, gre_tap_net_id);
 146        else if (tpi->proto == htons(ETH_P_ERSPAN) ||
 147                 tpi->proto == htons(ETH_P_ERSPAN2))
 148                itn = net_generic(net, erspan_net_id);
 149        else
 150                itn = net_generic(net, ipgre_net_id);
 151
 152        iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
 153        t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 154                             iph->daddr, iph->saddr, tpi->key);
 155
 156        if (!t)
 157                return -ENOENT;
 158
 159        switch (type) {
 160        default:
 161        case ICMP_PARAMETERPROB:
 162                return 0;
 163
 164        case ICMP_DEST_UNREACH:
 165                switch (code) {
 166                case ICMP_SR_FAILED:
 167                case ICMP_PORT_UNREACH:
 168                        /* Impossible event. */
 169                        return 0;
 170                default:
 171                        /* All others are translated to HOST_UNREACH.
 172                           rfc2003 contains "deep thoughts" about NET_UNREACH,
 173                           I believe they are just ether pollution. --ANK
 174                         */
 175                        break;
 176                }
 177                break;
 178
 179        case ICMP_TIME_EXCEEDED:
 180                if (code != ICMP_EXC_TTL)
 181                        return 0;
 182                data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
 183                break;
 184
 185        case ICMP_REDIRECT:
 186                break;
 187        }
 188
 189#if IS_ENABLED(CONFIG_IPV6)
 190       if (tpi->proto == htons(ETH_P_IPV6) &&
 191           !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
 192                                       type, data_len))
 193               return 0;
 194#endif
 195
 196        if (t->parms.iph.daddr == 0 ||
 197            ipv4_is_multicast(t->parms.iph.daddr))
 198                return 0;
 199
 200        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 201                return 0;
 202
 203        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 204                t->err_count++;
 205        else
 206                t->err_count = 1;
 207        t->err_time = jiffies;
 208
 209        return 0;
 210}
 211
 212static void gre_err(struct sk_buff *skb, u32 info)
 213{
 214        /* All the routers (except for Linux) return only
 215         * 8 bytes of packet payload. It means, that precise relaying of
 216         * ICMP in the real Internet is absolutely infeasible.
 217         *
 218         * Moreover, Cisco "wise men" put GRE key to the third word
 219         * in GRE header. It makes impossible maintaining even soft
 220         * state for keyed
 221         * GRE tunnels with enabled checksum. Tell them "thank you".
 222         *
 223         * Well, I wonder, rfc1812 was written by Cisco employee,
 224         * what the hell these idiots break standards established
 225         * by themselves???
 226         */
 227
 228        const struct iphdr *iph = (struct iphdr *)skb->data;
 229        const int type = icmp_hdr(skb)->type;
 230        const int code = icmp_hdr(skb)->code;
 231        struct tnl_ptk_info tpi;
 232
 233        if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
 234                             iph->ihl * 4) < 0)
 235                return;
 236
 237        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 238                ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 239                                 skb->dev->ifindex, IPPROTO_GRE);
 240                return;
 241        }
 242        if (type == ICMP_REDIRECT) {
 243                ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
 244                              IPPROTO_GRE);
 245                return;
 246        }
 247
 248        ipgre_err(skb, info, &tpi);
 249}
 250
 251static bool is_erspan_type1(int gre_hdr_len)
 252{
 253        /* Both ERSPAN type I (version 0) and type II (version 1) use
 254         * protocol 0x88BE, but the type I has only 4-byte GRE header,
 255         * while type II has 8-byte.
 256         */
 257        return gre_hdr_len == 4;
 258}
 259
 260static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 261                      int gre_hdr_len)
 262{
 263        struct net *net = dev_net(skb->dev);
 264        struct metadata_dst *tun_dst = NULL;
 265        struct erspan_base_hdr *ershdr;
 266        struct ip_tunnel_net *itn;
 267        struct ip_tunnel *tunnel;
 268        const struct iphdr *iph;
 269        struct erspan_md2 *md2;
 270        int ver;
 271        int len;
 272
 273        itn = net_generic(net, erspan_net_id);
 274        iph = ip_hdr(skb);
 275        if (is_erspan_type1(gre_hdr_len)) {
 276                ver = 0;
 277                tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
 278                                          tpi->flags | TUNNEL_NO_KEY,
 279                                          iph->saddr, iph->daddr, 0);
 280        } else {
 281                ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
 282                ver = ershdr->ver;
 283                tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
 284                                          tpi->flags | TUNNEL_KEY,
 285                                          iph->saddr, iph->daddr, tpi->key);
 286        }
 287
 288        if (tunnel) {
 289                if (is_erspan_type1(gre_hdr_len))
 290                        len = gre_hdr_len;
 291                else
 292                        len = gre_hdr_len + erspan_hdr_len(ver);
 293
 294                if (unlikely(!pskb_may_pull(skb, len)))
 295                        return PACKET_REJECT;
 296
 297                if (__iptunnel_pull_header(skb,
 298                                           len,
 299                                           htons(ETH_P_TEB),
 300                                           false, false) < 0)
 301                        goto drop;
 302
 303                if (tunnel->collect_md) {
 304                        struct erspan_metadata *pkt_md, *md;
 305                        struct ip_tunnel_info *info;
 306                        unsigned char *gh;
 307                        __be64 tun_id;
 308                        __be16 flags;
 309
 310                        tpi->flags |= TUNNEL_KEY;
 311                        flags = tpi->flags;
 312                        tun_id = key32_to_tunnel_id(tpi->key);
 313
 314                        tun_dst = ip_tun_rx_dst(skb, flags,
 315                                                tun_id, sizeof(*md));
 316                        if (!tun_dst)
 317                                return PACKET_REJECT;
 318
 319                        /* skb can be uncloned in __iptunnel_pull_header, so
 320                         * old pkt_md is no longer valid and we need to reset
 321                         * it
 322                         */
 323                        gh = skb_network_header(skb) +
 324                             skb_network_header_len(skb);
 325                        pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
 326                                                            sizeof(*ershdr));
 327                        md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
 328                        md->version = ver;
 329                        md2 = &md->u.md2;
 330                        memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
 331                                                       ERSPAN_V2_MDSIZE);
 332
 333                        info = &tun_dst->u.tun_info;
 334                        info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
 335                        info->options_len = sizeof(*md);
 336                }
 337
 338                skb_reset_mac_header(skb);
 339                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 340                return PACKET_RCVD;
 341        }
 342        return PACKET_REJECT;
 343
 344drop:
 345        kfree_skb(skb);
 346        return PACKET_RCVD;
 347}
 348
 349static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 350                       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
 351{
 352        struct metadata_dst *tun_dst = NULL;
 353        const struct iphdr *iph;
 354        struct ip_tunnel *tunnel;
 355
 356        iph = ip_hdr(skb);
 357        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 358                                  iph->saddr, iph->daddr, tpi->key);
 359
 360        if (tunnel) {
 361                const struct iphdr *tnl_params;
 362
 363                if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
 364                                           raw_proto, false) < 0)
 365                        goto drop;
 366
 367                if (tunnel->dev->type != ARPHRD_NONE)
 368                        skb_pop_mac_header(skb);
 369                else
 370                        skb_reset_mac_header(skb);
 371
 372                tnl_params = &tunnel->parms.iph;
 373                if (tunnel->collect_md || tnl_params->daddr == 0) {
 374                        __be16 flags;
 375                        __be64 tun_id;
 376
 377                        flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
 378                        tun_id = key32_to_tunnel_id(tpi->key);
 379                        tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
 380                        if (!tun_dst)
 381                                return PACKET_REJECT;
 382                }
 383
 384                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 385                return PACKET_RCVD;
 386        }
 387        return PACKET_NEXT;
 388
 389drop:
 390        kfree_skb(skb);
 391        return PACKET_RCVD;
 392}
 393
 394static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 395                     int hdr_len)
 396{
 397        struct net *net = dev_net(skb->dev);
 398        struct ip_tunnel_net *itn;
 399        int res;
 400
 401        if (tpi->proto == htons(ETH_P_TEB))
 402                itn = net_generic(net, gre_tap_net_id);
 403        else
 404                itn = net_generic(net, ipgre_net_id);
 405
 406        res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
 407        if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
 408                /* ipgre tunnels in collect metadata mode should receive
 409                 * also ETH_P_TEB traffic.
 410                 */
 411                itn = net_generic(net, ipgre_net_id);
 412                res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
 413        }
 414        return res;
 415}
 416
 417static int gre_rcv(struct sk_buff *skb)
 418{
 419        struct tnl_ptk_info tpi;
 420        bool csum_err = false;
 421        int hdr_len;
 422
 423#ifdef CONFIG_NET_IPGRE_BROADCAST
 424        if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
 425                /* Looped back packet, drop it! */
 426                if (rt_is_output_route(skb_rtable(skb)))
 427                        goto drop;
 428        }
 429#endif
 430
 431        hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
 432        if (hdr_len < 0)
 433                goto drop;
 434
 435        if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
 436                     tpi.proto == htons(ETH_P_ERSPAN2))) {
 437                if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 438                        return 0;
 439                goto out;
 440        }
 441
 442        if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 443                return 0;
 444
 445out:
 446        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 447drop:
 448        kfree_skb(skb);
 449        return 0;
 450}
 451
 452static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 453                       const struct iphdr *tnl_params,
 454                       __be16 proto)
 455{
 456        struct ip_tunnel *tunnel = netdev_priv(dev);
 457
 458        if (tunnel->parms.o_flags & TUNNEL_SEQ)
 459                tunnel->o_seqno++;
 460
 461        /* Push GRE header. */
 462        gre_build_header(skb, tunnel->tun_hlen,
 463                         tunnel->parms.o_flags, proto, tunnel->parms.o_key,
 464                         htonl(tunnel->o_seqno));
 465
 466        ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 467}
 468
 469static int gre_handle_offloads(struct sk_buff *skb, bool csum)
 470{
 471        return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 472}
 473
 474static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 475                        __be16 proto)
 476{
 477        struct ip_tunnel *tunnel = netdev_priv(dev);
 478        struct ip_tunnel_info *tun_info;
 479        const struct ip_tunnel_key *key;
 480        int tunnel_hlen;
 481        __be16 flags;
 482
 483        tun_info = skb_tunnel_info(skb);
 484        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
 485                     ip_tunnel_info_af(tun_info) != AF_INET))
 486                goto err_free_skb;
 487
 488        key = &tun_info->key;
 489        tunnel_hlen = gre_calc_hlen(key->tun_flags);
 490
 491        if (skb_cow_head(skb, dev->needed_headroom))
 492                goto err_free_skb;
 493
 494        /* Push Tunnel header. */
 495        if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
 496                goto err_free_skb;
 497
 498        flags = tun_info->key.tun_flags &
 499                (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
 500        gre_build_header(skb, tunnel_hlen, flags, proto,
 501                         tunnel_id_to_key32(tun_info->key.tun_id),
 502                         (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
 503
 504        ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
 505
 506        return;
 507
 508err_free_skb:
 509        kfree_skb(skb);
 510        dev->stats.tx_dropped++;
 511}
 512
 513static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 514{
 515        struct ip_tunnel *tunnel = netdev_priv(dev);
 516        struct ip_tunnel_info *tun_info;
 517        const struct ip_tunnel_key *key;
 518        struct erspan_metadata *md;
 519        bool truncate = false;
 520        __be16 proto;
 521        int tunnel_hlen;
 522        int version;
 523        int nhoff;
 524        int thoff;
 525
 526        tun_info = skb_tunnel_info(skb);
 527        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
 528                     ip_tunnel_info_af(tun_info) != AF_INET))
 529                goto err_free_skb;
 530
 531        key = &tun_info->key;
 532        if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
 533                goto err_free_skb;
 534        if (tun_info->options_len < sizeof(*md))
 535                goto err_free_skb;
 536        md = ip_tunnel_info_opts(tun_info);
 537
 538        /* ERSPAN has fixed 8 byte GRE header */
 539        version = md->version;
 540        tunnel_hlen = 8 + erspan_hdr_len(version);
 541
 542        if (skb_cow_head(skb, dev->needed_headroom))
 543                goto err_free_skb;
 544
 545        if (gre_handle_offloads(skb, false))
 546                goto err_free_skb;
 547
 548        if (skb->len > dev->mtu + dev->hard_header_len) {
 549                pskb_trim(skb, dev->mtu + dev->hard_header_len);
 550                truncate = true;
 551        }
 552
 553        nhoff = skb_network_header(skb) - skb_mac_header(skb);
 554        if (skb->protocol == htons(ETH_P_IP) &&
 555            (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
 556                truncate = true;
 557
 558        thoff = skb_transport_header(skb) - skb_mac_header(skb);
 559        if (skb->protocol == htons(ETH_P_IPV6) &&
 560            (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
 561                truncate = true;
 562
 563        if (version == 1) {
 564                erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
 565                                    ntohl(md->u.index), truncate, true);
 566                proto = htons(ETH_P_ERSPAN);
 567        } else if (version == 2) {
 568                erspan_build_header_v2(skb,
 569                                       ntohl(tunnel_id_to_key32(key->tun_id)),
 570                                       md->u.md2.dir,
 571                                       get_hwid(&md->u.md2),
 572                                       truncate, true);
 573                proto = htons(ETH_P_ERSPAN2);
 574        } else {
 575                goto err_free_skb;
 576        }
 577
 578        gre_build_header(skb, 8, TUNNEL_SEQ,
 579                         proto, 0, htonl(tunnel->o_seqno++));
 580
 581        ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
 582
 583        return;
 584
 585err_free_skb:
 586        kfree_skb(skb);
 587        dev->stats.tx_dropped++;
 588}
 589
 590static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 591{
 592        struct ip_tunnel_info *info = skb_tunnel_info(skb);
 593        const struct ip_tunnel_key *key;
 594        struct rtable *rt;
 595        struct flowi4 fl4;
 596
 597        if (ip_tunnel_info_af(info) != AF_INET)
 598                return -EINVAL;
 599
 600        key = &info->key;
 601        ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
 602                            tunnel_id_to_key32(key->tun_id), key->tos, 0,
 603                            skb->mark, skb_get_hash(skb));
 604        rt = ip_route_output_key(dev_net(dev), &fl4);
 605        if (IS_ERR(rt))
 606                return PTR_ERR(rt);
 607
 608        ip_rt_put(rt);
 609        info->key.u.ipv4.src = fl4.saddr;
 610        return 0;
 611}
 612
 613static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 614                              struct net_device *dev)
 615{
 616        struct ip_tunnel *tunnel = netdev_priv(dev);
 617        const struct iphdr *tnl_params;
 618
 619        if (!pskb_inet_may_pull(skb))
 620                goto free_skb;
 621
 622        if (tunnel->collect_md) {
 623                gre_fb_xmit(skb, dev, skb->protocol);
 624                return NETDEV_TX_OK;
 625        }
 626
 627        if (dev->header_ops) {
 628                if (skb_cow_head(skb, 0))
 629                        goto free_skb;
 630
 631                tnl_params = (const struct iphdr *)skb->data;
 632
 633                /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
 634                 * to gre header.
 635                 */
 636                skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
 637                skb_reset_mac_header(skb);
 638        } else {
 639                if (skb_cow_head(skb, dev->needed_headroom))
 640                        goto free_skb;
 641
 642                tnl_params = &tunnel->parms.iph;
 643        }
 644
 645        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
 646                goto free_skb;
 647
 648        __gre_xmit(skb, dev, tnl_params, skb->protocol);
 649        return NETDEV_TX_OK;
 650
 651free_skb:
 652        kfree_skb(skb);
 653        dev->stats.tx_dropped++;
 654        return NETDEV_TX_OK;
 655}
 656
 657static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 658                               struct net_device *dev)
 659{
 660        struct ip_tunnel *tunnel = netdev_priv(dev);
 661        bool truncate = false;
 662        __be16 proto;
 663
 664        if (!pskb_inet_may_pull(skb))
 665                goto free_skb;
 666
 667        if (tunnel->collect_md) {
 668                erspan_fb_xmit(skb, dev);
 669                return NETDEV_TX_OK;
 670        }
 671
 672        if (gre_handle_offloads(skb, false))
 673                goto free_skb;
 674
 675        if (skb_cow_head(skb, dev->needed_headroom))
 676                goto free_skb;
 677
 678        if (skb->len > dev->mtu + dev->hard_header_len) {
 679                pskb_trim(skb, dev->mtu + dev->hard_header_len);
 680                truncate = true;
 681        }
 682
 683        /* Push ERSPAN header */
 684        if (tunnel->erspan_ver == 0) {
 685                proto = htons(ETH_P_ERSPAN);
 686                tunnel->parms.o_flags &= ~TUNNEL_SEQ;
 687        } else if (tunnel->erspan_ver == 1) {
 688                erspan_build_header(skb, ntohl(tunnel->parms.o_key),
 689                                    tunnel->index,
 690                                    truncate, true);
 691                proto = htons(ETH_P_ERSPAN);
 692        } else if (tunnel->erspan_ver == 2) {
 693                erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
 694                                       tunnel->dir, tunnel->hwid,
 695                                       truncate, true);
 696                proto = htons(ETH_P_ERSPAN2);
 697        } else {
 698                goto free_skb;
 699        }
 700
 701        tunnel->parms.o_flags &= ~TUNNEL_KEY;
 702        __gre_xmit(skb, dev, &tunnel->parms.iph, proto);
 703        return NETDEV_TX_OK;
 704
 705free_skb:
 706        kfree_skb(skb);
 707        dev->stats.tx_dropped++;
 708        return NETDEV_TX_OK;
 709}
 710
 711static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 712                                struct net_device *dev)
 713{
 714        struct ip_tunnel *tunnel = netdev_priv(dev);
 715
 716        if (!pskb_inet_may_pull(skb))
 717                goto free_skb;
 718
 719        if (tunnel->collect_md) {
 720                gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
 721                return NETDEV_TX_OK;
 722        }
 723
 724        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
 725                goto free_skb;
 726
 727        if (skb_cow_head(skb, dev->needed_headroom))
 728                goto free_skb;
 729
 730        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
 731        return NETDEV_TX_OK;
 732
 733free_skb:
 734        kfree_skb(skb);
 735        dev->stats.tx_dropped++;
 736        return NETDEV_TX_OK;
 737}
 738
 739static void ipgre_link_update(struct net_device *dev, bool set_mtu)
 740{
 741        struct ip_tunnel *tunnel = netdev_priv(dev);
 742        int len;
 743
 744        len = tunnel->tun_hlen;
 745        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
 746        len = tunnel->tun_hlen - len;
 747        tunnel->hlen = tunnel->hlen + len;
 748
 749        if (dev->header_ops)
 750                dev->hard_header_len += len;
 751        else
 752                dev->needed_headroom += len;
 753
 754        if (set_mtu)
 755                dev->mtu = max_t(int, dev->mtu - len, 68);
 756
 757        if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
 758                if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
 759                    tunnel->encap.type == TUNNEL_ENCAP_NONE) {
 760                        dev->features |= NETIF_F_GSO_SOFTWARE;
 761                        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 762                } else {
 763                        dev->features &= ~NETIF_F_GSO_SOFTWARE;
 764                        dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
 765                }
 766                dev->features |= NETIF_F_LLTX;
 767        } else {
 768                dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
 769                dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
 770        }
 771}
 772
 773static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p,
 774                            int cmd)
 775{
 776        int err;
 777
 778        if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
 779                if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
 780                    p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
 781                    ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING)))
 782                        return -EINVAL;
 783        }
 784
 785        p->i_flags = gre_flags_to_tnl_flags(p->i_flags);
 786        p->o_flags = gre_flags_to_tnl_flags(p->o_flags);
 787
 788        err = ip_tunnel_ctl(dev, p, cmd);
 789        if (err)
 790                return err;
 791
 792        if (cmd == SIOCCHGTUNNEL) {
 793                struct ip_tunnel *t = netdev_priv(dev);
 794
 795                t->parms.i_flags = p->i_flags;
 796                t->parms.o_flags = p->o_flags;
 797
 798                if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
 799                        ipgre_link_update(dev, true);
 800        }
 801
 802        p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
 803        p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
 804        return 0;
 805}
 806
 807/* Nice toy. Unfortunately, useless in real life :-)
 808   It allows to construct virtual multiprotocol broadcast "LAN"
 809   over the Internet, provided multicast routing is tuned.
 810
 811
 812   I have no idea was this bicycle invented before me,
 813   so that I had to set ARPHRD_IPGRE to a random value.
 814   I have an impression, that Cisco could make something similar,
 815   but this feature is apparently missing in IOS<=11.2(8).
 816
 817   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
 818   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
 819
 820   ping -t 255 224.66.66.66
 821
 822   If nobody answers, mbone does not work.
 823
 824   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
 825   ip addr add 10.66.66.<somewhat>/24 dev Universe
 826   ifconfig Universe up
 827   ifconfig Universe add fe80::<Your_real_addr>/10
 828   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
 829   ftp 10.66.66.66
 830   ...
 831   ftp fec0:6666:6666::193.233.7.65
 832   ...
 833 */
 834static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
 835                        unsigned short type,
 836                        const void *daddr, const void *saddr, unsigned int len)
 837{
 838        struct ip_tunnel *t = netdev_priv(dev);
 839        struct iphdr *iph;
 840        struct gre_base_hdr *greh;
 841
 842        iph = skb_push(skb, t->hlen + sizeof(*iph));
 843        greh = (struct gre_base_hdr *)(iph+1);
 844        greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
 845        greh->protocol = htons(type);
 846
 847        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
 848
 849        /* Set the source hardware address. */
 850        if (saddr)
 851                memcpy(&iph->saddr, saddr, 4);
 852        if (daddr)
 853                memcpy(&iph->daddr, daddr, 4);
 854        if (iph->daddr)
 855                return t->hlen + sizeof(*iph);
 856
 857        return -(t->hlen + sizeof(*iph));
 858}
 859
 860static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
 861{
 862        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
 863        memcpy(haddr, &iph->saddr, 4);
 864        return 4;
 865}
 866
 867static const struct header_ops ipgre_header_ops = {
 868        .create = ipgre_header,
 869        .parse  = ipgre_header_parse,
 870};
 871
 872#ifdef CONFIG_NET_IPGRE_BROADCAST
 873static int ipgre_open(struct net_device *dev)
 874{
 875        struct ip_tunnel *t = netdev_priv(dev);
 876
 877        if (ipv4_is_multicast(t->parms.iph.daddr)) {
 878                struct flowi4 fl4;
 879                struct rtable *rt;
 880
 881                rt = ip_route_output_gre(t->net, &fl4,
 882                                         t->parms.iph.daddr,
 883                                         t->parms.iph.saddr,
 884                                         t->parms.o_key,
 885                                         RT_TOS(t->parms.iph.tos),
 886                                         t->parms.link);
 887                if (IS_ERR(rt))
 888                        return -EADDRNOTAVAIL;
 889                dev = rt->dst.dev;
 890                ip_rt_put(rt);
 891                if (!__in_dev_get_rtnl(dev))
 892                        return -EADDRNOTAVAIL;
 893                t->mlink = dev->ifindex;
 894                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
 895        }
 896        return 0;
 897}
 898
 899static int ipgre_close(struct net_device *dev)
 900{
 901        struct ip_tunnel *t = netdev_priv(dev);
 902
 903        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
 904                struct in_device *in_dev;
 905                in_dev = inetdev_by_index(t->net, t->mlink);
 906                if (in_dev)
 907                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
 908        }
 909        return 0;
 910}
 911#endif
 912
 913static const struct net_device_ops ipgre_netdev_ops = {
 914        .ndo_init               = ipgre_tunnel_init,
 915        .ndo_uninit             = ip_tunnel_uninit,
 916#ifdef CONFIG_NET_IPGRE_BROADCAST
 917        .ndo_open               = ipgre_open,
 918        .ndo_stop               = ipgre_close,
 919#endif
 920        .ndo_start_xmit         = ipgre_xmit,
 921        .ndo_do_ioctl           = ip_tunnel_ioctl,
 922        .ndo_change_mtu         = ip_tunnel_change_mtu,
 923        .ndo_get_stats64        = dev_get_tstats64,
 924        .ndo_get_iflink         = ip_tunnel_get_iflink,
 925        .ndo_tunnel_ctl         = ipgre_tunnel_ctl,
 926};
 927
 928#define GRE_FEATURES (NETIF_F_SG |              \
 929                      NETIF_F_FRAGLIST |        \
 930                      NETIF_F_HIGHDMA |         \
 931                      NETIF_F_HW_CSUM)
 932
 933static void ipgre_tunnel_setup(struct net_device *dev)
 934{
 935        dev->netdev_ops         = &ipgre_netdev_ops;
 936        dev->type               = ARPHRD_IPGRE;
 937        ip_tunnel_setup(dev, ipgre_net_id);
 938}
 939
 940static void __gre_tunnel_init(struct net_device *dev)
 941{
 942        struct ip_tunnel *tunnel;
 943
 944        tunnel = netdev_priv(dev);
 945        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
 946        tunnel->parms.iph.protocol = IPPROTO_GRE;
 947
 948        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
 949        dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
 950
 951        dev->features           |= GRE_FEATURES;
 952        dev->hw_features        |= GRE_FEATURES;
 953
 954        if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
 955                /* TCP offload with GRE SEQ is not supported, nor
 956                 * can we support 2 levels of outer headers requiring
 957                 * an update.
 958                 */
 959                if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
 960                    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
 961                        dev->features    |= NETIF_F_GSO_SOFTWARE;
 962                        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 963                }
 964
 965                /* Can use a lockless transmit, unless we generate
 966                 * output sequences
 967                 */
 968                dev->features |= NETIF_F_LLTX;
 969        }
 970}
 971
 972static int ipgre_tunnel_init(struct net_device *dev)
 973{
 974        struct ip_tunnel *tunnel = netdev_priv(dev);
 975        struct iphdr *iph = &tunnel->parms.iph;
 976
 977        __gre_tunnel_init(dev);
 978
 979        memcpy(dev->dev_addr, &iph->saddr, 4);
 980        memcpy(dev->broadcast, &iph->daddr, 4);
 981
 982        dev->flags              = IFF_NOARP;
 983        netif_keep_dst(dev);
 984        dev->addr_len           = 4;
 985
 986        if (iph->daddr && !tunnel->collect_md) {
 987#ifdef CONFIG_NET_IPGRE_BROADCAST
 988                if (ipv4_is_multicast(iph->daddr)) {
 989                        if (!iph->saddr)
 990                                return -EINVAL;
 991                        dev->flags = IFF_BROADCAST;
 992                        dev->header_ops = &ipgre_header_ops;
 993                        dev->hard_header_len = tunnel->hlen + sizeof(*iph);
 994                        dev->needed_headroom = 0;
 995                }
 996#endif
 997        } else if (!tunnel->collect_md) {
 998                dev->header_ops = &ipgre_header_ops;
 999                dev->hard_header_len = tunnel->hlen + sizeof(*iph);
1000                dev->needed_headroom = 0;
1001        }
1002
1003        return ip_tunnel_init(dev);
1004}
1005
1006static const struct gre_protocol ipgre_protocol = {
1007        .handler     = gre_rcv,
1008        .err_handler = gre_err,
1009};
1010
1011static int __net_init ipgre_init_net(struct net *net)
1012{
1013        return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
1014}
1015
1016static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
1017{
1018        ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
1019}
1020
1021static struct pernet_operations ipgre_net_ops = {
1022        .init = ipgre_init_net,
1023        .exit_batch = ipgre_exit_batch_net,
1024        .id   = &ipgre_net_id,
1025        .size = sizeof(struct ip_tunnel_net),
1026};
1027
1028static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
1029                                 struct netlink_ext_ack *extack)
1030{
1031        __be16 flags;
1032
1033        if (!data)
1034                return 0;
1035
1036        flags = 0;
1037        if (data[IFLA_GRE_IFLAGS])
1038                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1039        if (data[IFLA_GRE_OFLAGS])
1040                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1041        if (flags & (GRE_VERSION|GRE_ROUTING))
1042                return -EINVAL;
1043
1044        if (data[IFLA_GRE_COLLECT_METADATA] &&
1045            data[IFLA_GRE_ENCAP_TYPE] &&
1046            nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
1047                return -EINVAL;
1048
1049        return 0;
1050}
1051
1052static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
1053                              struct netlink_ext_ack *extack)
1054{
1055        __be32 daddr;
1056
1057        if (tb[IFLA_ADDRESS]) {
1058                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1059                        return -EINVAL;
1060                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1061                        return -EADDRNOTAVAIL;
1062        }
1063
1064        if (!data)
1065                goto out;
1066
1067        if (data[IFLA_GRE_REMOTE]) {
1068                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1069                if (!daddr)
1070                        return -EINVAL;
1071        }
1072
1073out:
1074        return ipgre_tunnel_validate(tb, data, extack);
1075}
1076
1077static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
1078                           struct netlink_ext_ack *extack)
1079{
1080        __be16 flags = 0;
1081        int ret;
1082
1083        if (!data)
1084                return 0;
1085
1086        ret = ipgre_tap_validate(tb, data, extack);
1087        if (ret)
1088                return ret;
1089
1090        if (data[IFLA_GRE_ERSPAN_VER] &&
1091            nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0)
1092                return 0;
1093
1094        /* ERSPAN type II/III should only have GRE sequence and key flag */
1095        if (data[IFLA_GRE_OFLAGS])
1096                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1097        if (data[IFLA_GRE_IFLAGS])
1098                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1099        if (!data[IFLA_GRE_COLLECT_METADATA] &&
1100            flags != (GRE_SEQ | GRE_KEY))
1101                return -EINVAL;
1102
1103        /* ERSPAN Session ID only has 10-bit. Since we reuse
1104         * 32-bit key field as ID, check it's range.
1105         */
1106        if (data[IFLA_GRE_IKEY] &&
1107            (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
1108                return -EINVAL;
1109
1110        if (data[IFLA_GRE_OKEY] &&
1111            (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
1112                return -EINVAL;
1113
1114        return 0;
1115}
1116
1117static int ipgre_netlink_parms(struct net_device *dev,
1118                                struct nlattr *data[],
1119                                struct nlattr *tb[],
1120                                struct ip_tunnel_parm *parms,
1121                                __u32 *fwmark)
1122{
1123        struct ip_tunnel *t = netdev_priv(dev);
1124
1125        memset(parms, 0, sizeof(*parms));
1126
1127        parms->iph.protocol = IPPROTO_GRE;
1128
1129        if (!data)
1130                return 0;
1131
1132        if (data[IFLA_GRE_LINK])
1133                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1134
1135        if (data[IFLA_GRE_IFLAGS])
1136                parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
1137
1138        if (data[IFLA_GRE_OFLAGS])
1139                parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
1140
1141        if (data[IFLA_GRE_IKEY])
1142                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1143
1144        if (data[IFLA_GRE_OKEY])
1145                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1146
1147        if (data[IFLA_GRE_LOCAL])
1148                parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
1149
1150        if (data[IFLA_GRE_REMOTE])
1151                parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
1152
1153        if (data[IFLA_GRE_TTL])
1154                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1155
1156        if (data[IFLA_GRE_TOS])
1157                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1158
1159        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
1160                if (t->ignore_df)
1161                        return -EINVAL;
1162                parms->iph.frag_off = htons(IP_DF);
1163        }
1164
1165        if (data[IFLA_GRE_COLLECT_METADATA]) {
1166                t->collect_md = true;
1167                if (dev->type == ARPHRD_IPGRE)
1168                        dev->type = ARPHRD_NONE;
1169        }
1170
1171        if (data[IFLA_GRE_IGNORE_DF]) {
1172                if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
1173                  && (parms->iph.frag_off & htons(IP_DF)))
1174                        return -EINVAL;
1175                t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
1176        }
1177
1178        if (data[IFLA_GRE_FWMARK])
1179                *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
1180
1181        return 0;
1182}
1183
1184static int erspan_netlink_parms(struct net_device *dev,
1185                                struct nlattr *data[],
1186                                struct nlattr *tb[],
1187                                struct ip_tunnel_parm *parms,
1188                                __u32 *fwmark)
1189{
1190        struct ip_tunnel *t = netdev_priv(dev);
1191        int err;
1192
1193        err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
1194        if (err)
1195                return err;
1196        if (!data)
1197                return 0;
1198
1199        if (data[IFLA_GRE_ERSPAN_VER]) {
1200                t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
1201
1202                if (t->erspan_ver > 2)
1203                        return -EINVAL;
1204        }
1205
1206        if (t->erspan_ver == 1) {
1207                if (data[IFLA_GRE_ERSPAN_INDEX]) {
1208                        t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
1209                        if (t->index & ~INDEX_MASK)
1210                                return -EINVAL;
1211                }
1212        } else if (t->erspan_ver == 2) {
1213                if (data[IFLA_GRE_ERSPAN_DIR]) {
1214                        t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
1215                        if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
1216                                return -EINVAL;
1217                }
1218                if (data[IFLA_GRE_ERSPAN_HWID]) {
1219                        t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
1220                        if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
1221                                return -EINVAL;
1222                }
1223        }
1224
1225        return 0;
1226}
1227
1228/* This function returns true when ENCAP attributes are present in the nl msg */
1229static bool ipgre_netlink_encap_parms(struct nlattr *data[],
1230                                      struct ip_tunnel_encap *ipencap)
1231{
1232        bool ret = false;
1233
1234        memset(ipencap, 0, sizeof(*ipencap));
1235
1236        if (!data)
1237                return ret;
1238
1239        if (data[IFLA_GRE_ENCAP_TYPE]) {
1240                ret = true;
1241                ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
1242        }
1243
1244        if (data[IFLA_GRE_ENCAP_FLAGS]) {
1245                ret = true;
1246                ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
1247        }
1248
1249        if (data[IFLA_GRE_ENCAP_SPORT]) {
1250                ret = true;
1251                ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
1252        }
1253
1254        if (data[IFLA_GRE_ENCAP_DPORT]) {
1255                ret = true;
1256                ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
1257        }
1258
1259        return ret;
1260}
1261
1262static int gre_tap_init(struct net_device *dev)
1263{
1264        __gre_tunnel_init(dev);
1265        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1266        netif_keep_dst(dev);
1267
1268        return ip_tunnel_init(dev);
1269}
1270
1271static const struct net_device_ops gre_tap_netdev_ops = {
1272        .ndo_init               = gre_tap_init,
1273        .ndo_uninit             = ip_tunnel_uninit,
1274        .ndo_start_xmit         = gre_tap_xmit,
1275        .ndo_set_mac_address    = eth_mac_addr,
1276        .ndo_validate_addr      = eth_validate_addr,
1277        .ndo_change_mtu         = ip_tunnel_change_mtu,
1278        .ndo_get_stats64        = dev_get_tstats64,
1279        .ndo_get_iflink         = ip_tunnel_get_iflink,
1280        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
1281};
1282
1283static int erspan_tunnel_init(struct net_device *dev)
1284{
1285        struct ip_tunnel *tunnel = netdev_priv(dev);
1286
1287        if (tunnel->erspan_ver == 0)
1288                tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */
1289        else
1290                tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */
1291
1292        tunnel->parms.iph.protocol = IPPROTO_GRE;
1293        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
1294                       erspan_hdr_len(tunnel->erspan_ver);
1295
1296        dev->features           |= GRE_FEATURES;
1297        dev->hw_features        |= GRE_FEATURES;
1298        dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
1299        netif_keep_dst(dev);
1300
1301        return ip_tunnel_init(dev);
1302}
1303
1304static const struct net_device_ops erspan_netdev_ops = {
1305        .ndo_init               = erspan_tunnel_init,
1306        .ndo_uninit             = ip_tunnel_uninit,
1307        .ndo_start_xmit         = erspan_xmit,
1308        .ndo_set_mac_address    = eth_mac_addr,
1309        .ndo_validate_addr      = eth_validate_addr,
1310        .ndo_change_mtu         = ip_tunnel_change_mtu,
1311        .ndo_get_stats64        = dev_get_tstats64,
1312        .ndo_get_iflink         = ip_tunnel_get_iflink,
1313        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
1314};
1315
1316static void ipgre_tap_setup(struct net_device *dev)
1317{
1318        ether_setup(dev);
1319        dev->max_mtu = 0;
1320        dev->netdev_ops = &gre_tap_netdev_ops;
1321        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1322        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1323        ip_tunnel_setup(dev, gre_tap_net_id);
1324}
1325
1326static int
1327ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
1328{
1329        struct ip_tunnel_encap ipencap;
1330
1331        if (ipgre_netlink_encap_parms(data, &ipencap)) {
1332                struct ip_tunnel *t = netdev_priv(dev);
1333                int err = ip_tunnel_encap_setup(t, &ipencap);
1334
1335                if (err < 0)
1336                        return err;
1337        }
1338
1339        return 0;
1340}
1341
1342static int ipgre_newlink(struct net *src_net, struct net_device *dev,
1343                         struct nlattr *tb[], struct nlattr *data[],
1344                         struct netlink_ext_ack *extack)
1345{
1346        struct ip_tunnel_parm p;
1347        __u32 fwmark = 0;
1348        int err;
1349
1350        err = ipgre_newlink_encap_setup(dev, data);
1351        if (err)
1352                return err;
1353
1354        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1355        if (err < 0)
1356                return err;
1357        return ip_tunnel_newlink(dev, tb, &p, fwmark);
1358}
1359
1360static int erspan_newlink(struct net *src_net, struct net_device *dev,
1361                          struct nlattr *tb[], struct nlattr *data[],
1362                          struct netlink_ext_ack *extack)
1363{
1364        struct ip_tunnel_parm p;
1365        __u32 fwmark = 0;
1366        int err;
1367
1368        err = ipgre_newlink_encap_setup(dev, data);
1369        if (err)
1370                return err;
1371
1372        err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
1373        if (err)
1374                return err;
1375        return ip_tunnel_newlink(dev, tb, &p, fwmark);
1376}
1377
1378static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1379                            struct nlattr *data[],
1380                            struct netlink_ext_ack *extack)
1381{
1382        struct ip_tunnel *t = netdev_priv(dev);
1383        __u32 fwmark = t->fwmark;
1384        struct ip_tunnel_parm p;
1385        int err;
1386
1387        err = ipgre_newlink_encap_setup(dev, data);
1388        if (err)
1389                return err;
1390
1391        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1392        if (err < 0)
1393                return err;
1394
1395        err = ip_tunnel_changelink(dev, tb, &p, fwmark);
1396        if (err < 0)
1397                return err;
1398
1399        t->parms.i_flags = p.i_flags;
1400        t->parms.o_flags = p.o_flags;
1401
1402        ipgre_link_update(dev, !tb[IFLA_MTU]);
1403
1404        return 0;
1405}
1406
1407static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
1408                             struct nlattr *data[],
1409                             struct netlink_ext_ack *extack)
1410{
1411        struct ip_tunnel *t = netdev_priv(dev);
1412        __u32 fwmark = t->fwmark;
1413        struct ip_tunnel_parm p;
1414        int err;
1415
1416        err = ipgre_newlink_encap_setup(dev, data);
1417        if (err)
1418                return err;
1419
1420        err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
1421        if (err < 0)
1422                return err;
1423
1424        err = ip_tunnel_changelink(dev, tb, &p, fwmark);
1425        if (err < 0)
1426                return err;
1427
1428        t->parms.i_flags = p.i_flags;
1429        t->parms.o_flags = p.o_flags;
1430
1431        return 0;
1432}
1433
1434static size_t ipgre_get_size(const struct net_device *dev)
1435{
1436        return
1437                /* IFLA_GRE_LINK */
1438                nla_total_size(4) +
1439                /* IFLA_GRE_IFLAGS */
1440                nla_total_size(2) +
1441                /* IFLA_GRE_OFLAGS */
1442                nla_total_size(2) +
1443                /* IFLA_GRE_IKEY */
1444                nla_total_size(4) +
1445                /* IFLA_GRE_OKEY */
1446                nla_total_size(4) +
1447                /* IFLA_GRE_LOCAL */
1448                nla_total_size(4) +
1449                /* IFLA_GRE_REMOTE */
1450                nla_total_size(4) +
1451                /* IFLA_GRE_TTL */
1452                nla_total_size(1) +
1453                /* IFLA_GRE_TOS */
1454                nla_total_size(1) +
1455                /* IFLA_GRE_PMTUDISC */
1456                nla_total_size(1) +
1457                /* IFLA_GRE_ENCAP_TYPE */
1458                nla_total_size(2) +
1459                /* IFLA_GRE_ENCAP_FLAGS */
1460                nla_total_size(2) +
1461                /* IFLA_GRE_ENCAP_SPORT */
1462                nla_total_size(2) +
1463                /* IFLA_GRE_ENCAP_DPORT */
1464                nla_total_size(2) +
1465                /* IFLA_GRE_COLLECT_METADATA */
1466                nla_total_size(0) +
1467                /* IFLA_GRE_IGNORE_DF */
1468                nla_total_size(1) +
1469                /* IFLA_GRE_FWMARK */
1470                nla_total_size(4) +
1471                /* IFLA_GRE_ERSPAN_INDEX */
1472                nla_total_size(4) +
1473                /* IFLA_GRE_ERSPAN_VER */
1474                nla_total_size(1) +
1475                /* IFLA_GRE_ERSPAN_DIR */
1476                nla_total_size(1) +
1477                /* IFLA_GRE_ERSPAN_HWID */
1478                nla_total_size(2) +
1479                0;
1480}
1481
1482static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1483{
1484        struct ip_tunnel *t = netdev_priv(dev);
1485        struct ip_tunnel_parm *p = &t->parms;
1486        __be16 o_flags = p->o_flags;
1487
1488        if (t->erspan_ver <= 2) {
1489                if (t->erspan_ver != 0 && !t->collect_md)
1490                        o_flags |= TUNNEL_KEY;
1491
1492                if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
1493                        goto nla_put_failure;
1494
1495                if (t->erspan_ver == 1) {
1496                        if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
1497                                goto nla_put_failure;
1498                } else if (t->erspan_ver == 2) {
1499                        if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
1500                                goto nla_put_failure;
1501                        if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
1502                                goto nla_put_failure;
1503                }
1504        }
1505
1506        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1507            nla_put_be16(skb, IFLA_GRE_IFLAGS,
1508                         gre_tnl_flags_to_gre_flags(p->i_flags)) ||
1509            nla_put_be16(skb, IFLA_GRE_OFLAGS,
1510                         gre_tnl_flags_to_gre_flags(o_flags)) ||
1511            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1512            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1513            nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1514            nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1515            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1516            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1517            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1518                       !!(p->iph.frag_off & htons(IP_DF))) ||
1519            nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
1520                goto nla_put_failure;
1521
1522        if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
1523                        t->encap.type) ||
1524            nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
1525                         t->encap.sport) ||
1526            nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
1527                         t->encap.dport) ||
1528            nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
1529                        t->encap.flags))
1530                goto nla_put_failure;
1531
1532        if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
1533                goto nla_put_failure;
1534
1535        if (t->collect_md) {
1536                if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
1537                        goto nla_put_failure;
1538        }
1539
1540        return 0;
1541
1542nla_put_failure:
1543        return -EMSGSIZE;
1544}
1545
1546static void erspan_setup(struct net_device *dev)
1547{
1548        struct ip_tunnel *t = netdev_priv(dev);
1549
1550        ether_setup(dev);
1551        dev->max_mtu = 0;
1552        dev->netdev_ops = &erspan_netdev_ops;
1553        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1554        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1555        ip_tunnel_setup(dev, erspan_net_id);
1556        t->erspan_ver = 1;
1557}
1558
1559static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1560        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1561        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1562        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1563        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1564        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1565        [IFLA_GRE_LOCAL]        = { .len = sizeof_field(struct iphdr, saddr) },
1566        [IFLA_GRE_REMOTE]       = { .len = sizeof_field(struct iphdr, daddr) },
1567        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1568        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1569        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1570        [IFLA_GRE_ENCAP_TYPE]   = { .type = NLA_U16 },
1571        [IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
1572        [IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
1573        [IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
1574        [IFLA_GRE_COLLECT_METADATA]     = { .type = NLA_FLAG },
1575        [IFLA_GRE_IGNORE_DF]    = { .type = NLA_U8 },
1576        [IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
1577        [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
1578        [IFLA_GRE_ERSPAN_VER]   = { .type = NLA_U8 },
1579        [IFLA_GRE_ERSPAN_DIR]   = { .type = NLA_U8 },
1580        [IFLA_GRE_ERSPAN_HWID]  = { .type = NLA_U16 },
1581};
1582
1583static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1584        .kind           = "gre",
1585        .maxtype        = IFLA_GRE_MAX,
1586        .policy         = ipgre_policy,
1587        .priv_size      = sizeof(struct ip_tunnel),
1588        .setup          = ipgre_tunnel_setup,
1589        .validate       = ipgre_tunnel_validate,
1590        .newlink        = ipgre_newlink,
1591        .changelink     = ipgre_changelink,
1592        .dellink        = ip_tunnel_dellink,
1593        .get_size       = ipgre_get_size,
1594        .fill_info      = ipgre_fill_info,
1595        .get_link_net   = ip_tunnel_get_link_net,
1596};
1597
1598static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1599        .kind           = "gretap",
1600        .maxtype        = IFLA_GRE_MAX,
1601        .policy         = ipgre_policy,
1602        .priv_size      = sizeof(struct ip_tunnel),
1603        .setup          = ipgre_tap_setup,
1604        .validate       = ipgre_tap_validate,
1605        .newlink        = ipgre_newlink,
1606        .changelink     = ipgre_changelink,
1607        .dellink        = ip_tunnel_dellink,
1608        .get_size       = ipgre_get_size,
1609        .fill_info      = ipgre_fill_info,
1610        .get_link_net   = ip_tunnel_get_link_net,
1611};
1612
1613static struct rtnl_link_ops erspan_link_ops __read_mostly = {
1614        .kind           = "erspan",
1615        .maxtype        = IFLA_GRE_MAX,
1616        .policy         = ipgre_policy,
1617        .priv_size      = sizeof(struct ip_tunnel),
1618        .setup          = erspan_setup,
1619        .validate       = erspan_validate,
1620        .newlink        = erspan_newlink,
1621        .changelink     = erspan_changelink,
1622        .dellink        = ip_tunnel_dellink,
1623        .get_size       = ipgre_get_size,
1624        .fill_info      = ipgre_fill_info,
1625        .get_link_net   = ip_tunnel_get_link_net,
1626};
1627
1628struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
1629                                        u8 name_assign_type)
1630{
1631        struct nlattr *tb[IFLA_MAX + 1];
1632        struct net_device *dev;
1633        LIST_HEAD(list_kill);
1634        struct ip_tunnel *t;
1635        int err;
1636
1637        memset(&tb, 0, sizeof(tb));
1638
1639        dev = rtnl_create_link(net, name, name_assign_type,
1640                               &ipgre_tap_ops, tb, NULL);
1641        if (IS_ERR(dev))
1642                return dev;
1643
1644        /* Configure flow based GRE device. */
1645        t = netdev_priv(dev);
1646        t->collect_md = true;
1647
1648        err = ipgre_newlink(net, dev, tb, NULL, NULL);
1649        if (err < 0) {
1650                free_netdev(dev);
1651                return ERR_PTR(err);
1652        }
1653
1654        /* openvswitch users expect packet sizes to be unrestricted,
1655         * so set the largest MTU we can.
1656         */
1657        err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
1658        if (err)
1659                goto out;
1660
1661        err = rtnl_configure_link(dev, NULL);
1662        if (err < 0)
1663                goto out;
1664
1665        return dev;
1666out:
1667        ip_tunnel_dellink(dev, &list_kill);
1668        unregister_netdevice_many(&list_kill);
1669        return ERR_PTR(err);
1670}
1671EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
1672
1673static int __net_init ipgre_tap_init_net(struct net *net)
1674{
1675        return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
1676}
1677
/*
 * Batched per-namespace exit hook for gretap: hands the whole list of
 * namespaces to ip_tunnel_delete_nets() together with the gretap net id
 * and link ops.
 */
static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
}
1682
1683static struct pernet_operations ipgre_tap_net_ops = {
1684        .init = ipgre_tap_init_net,
1685        .exit_batch = ipgre_tap_exit_batch_net,
1686        .id   = &gre_tap_net_id,
1687        .size = sizeof(struct ip_tunnel_net),
1688};
1689
1690static int __net_init erspan_init_net(struct net *net)
1691{
1692        return ip_tunnel_init_net(net, erspan_net_id,
1693                                  &erspan_link_ops, "erspan0");
1694}
1695
/*
 * Batched per-namespace exit hook for ERSPAN: hands the whole list of
 * namespaces to ip_tunnel_delete_nets() together with the erspan net id
 * and link ops.
 */
static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
{
	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
}
1700
1701static struct pernet_operations erspan_net_ops = {
1702        .init = erspan_init_net,
1703        .exit_batch = erspan_exit_batch_net,
1704        .id   = &erspan_net_id,
1705        .size = sizeof(struct ip_tunnel_net),
1706};
1707
1708static int __init ipgre_init(void)
1709{
1710        int err;
1711
1712        pr_info("GRE over IPv4 tunneling driver\n");
1713
1714        err = register_pernet_device(&ipgre_net_ops);
1715        if (err < 0)
1716                return err;
1717
1718        err = register_pernet_device(&ipgre_tap_net_ops);
1719        if (err < 0)
1720                goto pnet_tap_failed;
1721
1722        err = register_pernet_device(&erspan_net_ops);
1723        if (err < 0)
1724                goto pnet_erspan_failed;
1725
1726        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1727        if (err < 0) {
1728                pr_info("%s: can't add protocol\n", __func__);
1729                goto add_proto_failed;
1730        }
1731
1732        err = rtnl_link_register(&ipgre_link_ops);
1733        if (err < 0)
1734                goto rtnl_link_failed;
1735
1736        err = rtnl_link_register(&ipgre_tap_ops);
1737        if (err < 0)
1738                goto tap_ops_failed;
1739
1740        err = rtnl_link_register(&erspan_link_ops);
1741        if (err < 0)
1742                goto erspan_link_failed;
1743
1744        return 0;
1745
1746erspan_link_failed:
1747        rtnl_link_unregister(&ipgre_tap_ops);
1748tap_ops_failed:
1749        rtnl_link_unregister(&ipgre_link_ops);
1750rtnl_link_failed:
1751        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1752add_proto_failed:
1753        unregister_pernet_device(&erspan_net_ops);
1754pnet_erspan_failed:
1755        unregister_pernet_device(&ipgre_tap_net_ops);
1756pnet_tap_failed:
1757        unregister_pernet_device(&ipgre_net_ops);
1758        return err;
1759}
1760
/*
 * Module exit: undo everything ipgre_init() registered.  The rtnl link
 * kinds go first, then the GRE protocol handler, then the pernet devices.
 * NOTE(review): within each group the order is not the exact reverse of
 * the registration order in ipgre_init(); presumably the three tunnel
 * kinds are independent of each other - confirm before reordering.
 */
static void __exit ipgre_fini(void)
{
	/* rtnl link kinds */
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	rtnl_link_unregister(&erspan_link_ops);
	/* GRE protocol handler */
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	/* per-namespace state */
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
	unregister_pernet_device(&erspan_net_ops);
}
1771
/* Module entry and exit points. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Aliases for the three rtnl link kinds registered in ipgre_init()
 * (presumably used to autoload the module on request - confirm against
 * the macro definitions).
 */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
/* Aliases for the device names "gre0", "gretap0" and "erspan0". */
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");
1781