linux/net/ipv4/ip_gre.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *      Linux NET3:     GRE over IP protocol decoder.
   4 *
   5 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   6 */
   7
   8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   9
  10#include <linux/capability.h>
  11#include <linux/module.h>
  12#include <linux/types.h>
  13#include <linux/kernel.h>
  14#include <linux/slab.h>
  15#include <linux/uaccess.h>
  16#include <linux/skbuff.h>
  17#include <linux/netdevice.h>
  18#include <linux/in.h>
  19#include <linux/tcp.h>
  20#include <linux/udp.h>
  21#include <linux/if_arp.h>
  22#include <linux/if_vlan.h>
  23#include <linux/init.h>
  24#include <linux/in6.h>
  25#include <linux/inetdevice.h>
  26#include <linux/igmp.h>
  27#include <linux/netfilter_ipv4.h>
  28#include <linux/etherdevice.h>
  29#include <linux/if_ether.h>
  30
  31#include <net/sock.h>
  32#include <net/ip.h>
  33#include <net/icmp.h>
  34#include <net/protocol.h>
  35#include <net/ip_tunnels.h>
  36#include <net/arp.h>
  37#include <net/checksum.h>
  38#include <net/dsfield.h>
  39#include <net/inet_ecn.h>
  40#include <net/xfrm.h>
  41#include <net/net_namespace.h>
  42#include <net/netns/generic.h>
  43#include <net/rtnetlink.h>
  44#include <net/gre.h>
  45#include <net/dst_metadata.h>
  46#include <net/erspan.h>
  47
  48/*
  49   Problems & solutions
  50   --------------------
  51
  52   1. The most important issue is detecting local dead loops.
  53   They would cause complete host lockup in transmit, which
  54   would be "resolved" by stack overflow or, if queueing is enabled,
  55   with infinite looping in net_bh.
  56
   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.
  63
  64   Current solution: xmit_recursion breaks dead loops. This is a percpu
  65   counter, since when we enter the first ndo_xmit(), cpu migration is
  66   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
  67
  68   2. Networking dead loops would not kill routers, but would really
  69   kill network. IP hop limit plays role of "t->recursion" in this case,
  70   if we copy it from packet being encapsulated to upper header.
  71   It is very good solution, but it introduces two problems:
  72
  73   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  74     do not work over tunnels.
  75   - traceroute does not work. I planned to relay ICMP from tunnel,
  76     so that this problem would be solved and traceroute output
  77     would even more informative. This idea appeared to be wrong:
  78     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  79     true router now :-)), all routers (at least, in neighbourhood of mine)
  80     return only 8 bytes of payload. It is the end.
  81
  82   Hence, if we want that OSPF worked or traceroute said something reasonable,
  83   we should search for another solution.
  84
   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially
   taking fragmentation into account. In short, ttl is not a solution at all.
  88
  89   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  90   We force DF flag on tunnels with preconfigured hop limit,
  91   that is ALL. :-) Well, it does not remove the problem completely,
  92   but exponential growth of network traffic is changed to linear
  93   (branches, that exceed pmtu are pruned) and tunnel mtu
  94   rapidly degrades to value <68, where looping stops.
  95   Yes, it is not good if there exists a router in the loop,
  96   which does not force DF, even when encapsulating packets have DF set.
  97   But it is not our problem! Nobody could accuse us, we made
  98   all that we could make. Even if it is your gated who injected
  99   fatal route to network, even if it were you who configured
 100   fatal static route: you are innocent. :-)
 101
 102   Alexey Kuznetsov.
 103 */
 104
/* Module parameter (mode 0644). The flag is handed to ip_tunnel_rcv() on the
 * receive paths below; per its description it controls logging of packets
 * received with corrupted ECN.
 */
 105static bool log_ecn_error = true;
 106module_param(log_ecn_error, bool, 0644);
 107MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 108
/* Declarations only; the bodies/initialisers are not in this part of the file. */
 109static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 110static int ipgre_tunnel_init(struct net_device *dev);
 111static void erspan_build_header(struct sk_buff *skb,
 112                                u32 id, u32 index,
 113                                bool truncate, bool is_ipv4);
 114
/* Ids passed to net_generic() to fetch the per-namespace struct ip_tunnel_net:
 * gre_tap_net_id is used for ETH_P_TEB traffic, erspan_net_id for
 * ETH_P_ERSPAN/ETH_P_ERSPAN2, and ipgre_net_id for everything else.
 */
 115static unsigned int ipgre_net_id __read_mostly;
 116static unsigned int gre_tap_net_id __read_mostly;
 117static unsigned int erspan_net_id __read_mostly;
 118
 119static int ipgre_err(struct sk_buff *skb, u32 info,
 120                     const struct tnl_ptk_info *tpi)
 121{
 122
 123        /* All the routers (except for Linux) return only
 124           8 bytes of packet payload. It means, that precise relaying of
 125           ICMP in the real Internet is absolutely infeasible.
 126
 127           Moreover, Cisco "wise men" put GRE key to the third word
 128           in GRE header. It makes impossible maintaining even soft
 129           state for keyed GRE tunnels with enabled checksum. Tell
 130           them "thank you".
 131
 132           Well, I wonder, rfc1812 was written by Cisco employee,
 133           what the hell these idiots break standards established
 134           by themselves???
 135           */
 136        struct net *net = dev_net(skb->dev);
 137        struct ip_tunnel_net *itn;
 138        const struct iphdr *iph;
 139        const int type = icmp_hdr(skb)->type;
 140        const int code = icmp_hdr(skb)->code;
 141        unsigned int data_len = 0;
 142        struct ip_tunnel *t;
 143
 144        if (tpi->proto == htons(ETH_P_TEB))
 145                itn = net_generic(net, gre_tap_net_id);
 146        else if (tpi->proto == htons(ETH_P_ERSPAN) ||
 147                 tpi->proto == htons(ETH_P_ERSPAN2))
 148                itn = net_generic(net, erspan_net_id);
 149        else
 150                itn = net_generic(net, ipgre_net_id);
 151
 152        iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
 153        t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 154                             iph->daddr, iph->saddr, tpi->key);
 155
 156        if (!t)
 157                return -ENOENT;
 158
 159        switch (type) {
 160        default:
 161        case ICMP_PARAMETERPROB:
 162                return 0;
 163
 164        case ICMP_DEST_UNREACH:
 165                switch (code) {
 166                case ICMP_SR_FAILED:
 167                case ICMP_PORT_UNREACH:
 168                        /* Impossible event. */
 169                        return 0;
 170                default:
 171                        /* All others are translated to HOST_UNREACH.
 172                           rfc2003 contains "deep thoughts" about NET_UNREACH,
 173                           I believe they are just ether pollution. --ANK
 174                         */
 175                        break;
 176                }
 177                break;
 178
 179        case ICMP_TIME_EXCEEDED:
 180                if (code != ICMP_EXC_TTL)
 181                        return 0;
 182                data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
 183                break;
 184
 185        case ICMP_REDIRECT:
 186                break;
 187        }
 188
 189#if IS_ENABLED(CONFIG_IPV6)
 190       if (tpi->proto == htons(ETH_P_IPV6) &&
 191           !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
 192                                       type, data_len))
 193               return 0;
 194#endif
 195
 196        if (t->parms.iph.daddr == 0 ||
 197            ipv4_is_multicast(t->parms.iph.daddr))
 198                return 0;
 199
 200        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 201                return 0;
 202
 203        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 204                t->err_count++;
 205        else
 206                t->err_count = 1;
 207        t->err_time = jiffies;
 208
 209        return 0;
 210}
 211
 212static void gre_err(struct sk_buff *skb, u32 info)
 213{
 214        /* All the routers (except for Linux) return only
 215         * 8 bytes of packet payload. It means, that precise relaying of
 216         * ICMP in the real Internet is absolutely infeasible.
 217         *
 218         * Moreover, Cisco "wise men" put GRE key to the third word
 219         * in GRE header. It makes impossible maintaining even soft
 220         * state for keyed
 221         * GRE tunnels with enabled checksum. Tell them "thank you".
 222         *
 223         * Well, I wonder, rfc1812 was written by Cisco employee,
 224         * what the hell these idiots break standards established
 225         * by themselves???
 226         */
 227
 228        const struct iphdr *iph = (struct iphdr *)skb->data;
 229        const int type = icmp_hdr(skb)->type;
 230        const int code = icmp_hdr(skb)->code;
 231        struct tnl_ptk_info tpi;
 232
 233        if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
 234                             iph->ihl * 4) < 0)
 235                return;
 236
 237        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 238                ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 239                                 skb->dev->ifindex, IPPROTO_GRE);
 240                return;
 241        }
 242        if (type == ICMP_REDIRECT) {
 243                ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
 244                              IPPROTO_GRE);
 245                return;
 246        }
 247
 248        ipgre_err(skb, info, &tpi);
 249}
 250
 251static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 252                      int gre_hdr_len)
 253{
 254        struct net *net = dev_net(skb->dev);
 255        struct metadata_dst *tun_dst = NULL;
 256        struct erspan_base_hdr *ershdr;
 257        struct ip_tunnel_net *itn;
 258        struct ip_tunnel *tunnel;
 259        const struct iphdr *iph;
 260        struct erspan_md2 *md2;
 261        int ver;
 262        int len;
 263
 264        itn = net_generic(net, erspan_net_id);
 265
 266        iph = ip_hdr(skb);
 267        ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
 268        ver = ershdr->ver;
 269
 270        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
 271                                  tpi->flags | TUNNEL_KEY,
 272                                  iph->saddr, iph->daddr, tpi->key);
 273
 274        if (tunnel) {
 275                len = gre_hdr_len + erspan_hdr_len(ver);
 276                if (unlikely(!pskb_may_pull(skb, len)))
 277                        return PACKET_REJECT;
 278
 279                if (__iptunnel_pull_header(skb,
 280                                           len,
 281                                           htons(ETH_P_TEB),
 282                                           false, false) < 0)
 283                        goto drop;
 284
 285                if (tunnel->collect_md) {
 286                        struct erspan_metadata *pkt_md, *md;
 287                        struct ip_tunnel_info *info;
 288                        unsigned char *gh;
 289                        __be64 tun_id;
 290                        __be16 flags;
 291
 292                        tpi->flags |= TUNNEL_KEY;
 293                        flags = tpi->flags;
 294                        tun_id = key32_to_tunnel_id(tpi->key);
 295
 296                        tun_dst = ip_tun_rx_dst(skb, flags,
 297                                                tun_id, sizeof(*md));
 298                        if (!tun_dst)
 299                                return PACKET_REJECT;
 300
 301                        /* skb can be uncloned in __iptunnel_pull_header, so
 302                         * old pkt_md is no longer valid and we need to reset
 303                         * it
 304                         */
 305                        gh = skb_network_header(skb) +
 306                             skb_network_header_len(skb);
 307                        pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
 308                                                            sizeof(*ershdr));
 309                        md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
 310                        md->version = ver;
 311                        md2 = &md->u.md2;
 312                        memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
 313                                                       ERSPAN_V2_MDSIZE);
 314
 315                        info = &tun_dst->u.tun_info;
 316                        info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
 317                        info->options_len = sizeof(*md);
 318                }
 319
 320                skb_reset_mac_header(skb);
 321                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 322                return PACKET_RCVD;
 323        }
 324        return PACKET_REJECT;
 325
 326drop:
 327        kfree_skb(skb);
 328        return PACKET_RCVD;
 329}
 330
 331static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 332                       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
 333{
 334        struct metadata_dst *tun_dst = NULL;
 335        const struct iphdr *iph;
 336        struct ip_tunnel *tunnel;
 337
 338        iph = ip_hdr(skb);
 339        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 340                                  iph->saddr, iph->daddr, tpi->key);
 341
 342        if (tunnel) {
 343                if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
 344                                           raw_proto, false) < 0)
 345                        goto drop;
 346
 347                if (tunnel->dev->type != ARPHRD_NONE)
 348                        skb_pop_mac_header(skb);
 349                else
 350                        skb_reset_mac_header(skb);
 351                if (tunnel->collect_md) {
 352                        __be16 flags;
 353                        __be64 tun_id;
 354
 355                        flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
 356                        tun_id = key32_to_tunnel_id(tpi->key);
 357                        tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
 358                        if (!tun_dst)
 359                                return PACKET_REJECT;
 360                }
 361
 362                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 363                return PACKET_RCVD;
 364        }
 365        return PACKET_NEXT;
 366
 367drop:
 368        kfree_skb(skb);
 369        return PACKET_RCVD;
 370}
 371
 372static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 373                     int hdr_len)
 374{
 375        struct net *net = dev_net(skb->dev);
 376        struct ip_tunnel_net *itn;
 377        int res;
 378
 379        if (tpi->proto == htons(ETH_P_TEB))
 380                itn = net_generic(net, gre_tap_net_id);
 381        else
 382                itn = net_generic(net, ipgre_net_id);
 383
 384        res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
 385        if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
 386                /* ipgre tunnels in collect metadata mode should receive
 387                 * also ETH_P_TEB traffic.
 388                 */
 389                itn = net_generic(net, ipgre_net_id);
 390                res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
 391        }
 392        return res;
 393}
 394
 395static int gre_rcv(struct sk_buff *skb)
 396{
 397        struct tnl_ptk_info tpi;
 398        bool csum_err = false;
 399        int hdr_len;
 400
 401#ifdef CONFIG_NET_IPGRE_BROADCAST
 402        if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
 403                /* Looped back packet, drop it! */
 404                if (rt_is_output_route(skb_rtable(skb)))
 405                        goto drop;
 406        }
 407#endif
 408
 409        hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
 410        if (hdr_len < 0)
 411                goto drop;
 412
 413        if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
 414                     tpi.proto == htons(ETH_P_ERSPAN2))) {
 415                if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 416                        return 0;
 417                goto out;
 418        }
 419
 420        if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 421                return 0;
 422
 423out:
 424        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 425drop:
 426        kfree_skb(skb);
 427        return 0;
 428}
 429
 430static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 431                       const struct iphdr *tnl_params,
 432                       __be16 proto)
 433{
 434        struct ip_tunnel *tunnel = netdev_priv(dev);
 435
 436        if (tunnel->parms.o_flags & TUNNEL_SEQ)
 437                tunnel->o_seqno++;
 438
 439        /* Push GRE header. */
 440        gre_build_header(skb, tunnel->tun_hlen,
 441                         tunnel->parms.o_flags, proto, tunnel->parms.o_key,
 442                         htonl(tunnel->o_seqno));
 443
 444        ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 445}
 446
 447static int gre_handle_offloads(struct sk_buff *skb, bool csum)
 448{
 449        return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 450}
 451
 452static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 453                        __be16 proto)
 454{
 455        struct ip_tunnel *tunnel = netdev_priv(dev);
 456        struct ip_tunnel_info *tun_info;
 457        const struct ip_tunnel_key *key;
 458        int tunnel_hlen;
 459        __be16 flags;
 460
 461        tun_info = skb_tunnel_info(skb);
 462        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
 463                     ip_tunnel_info_af(tun_info) != AF_INET))
 464                goto err_free_skb;
 465
 466        key = &tun_info->key;
 467        tunnel_hlen = gre_calc_hlen(key->tun_flags);
 468
 469        if (skb_cow_head(skb, dev->needed_headroom))
 470                goto err_free_skb;
 471
 472        /* Push Tunnel header. */
 473        if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
 474                goto err_free_skb;
 475
 476        flags = tun_info->key.tun_flags &
 477                (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
 478        gre_build_header(skb, tunnel_hlen, flags, proto,
 479                         tunnel_id_to_key32(tun_info->key.tun_id),
 480                         (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
 481
 482        ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
 483
 484        return;
 485
 486err_free_skb:
 487        kfree_skb(skb);
 488        dev->stats.tx_dropped++;
 489}
 490
 491static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 492{
 493        struct ip_tunnel *tunnel = netdev_priv(dev);
 494        struct ip_tunnel_info *tun_info;
 495        const struct ip_tunnel_key *key;
 496        struct erspan_metadata *md;
 497        bool truncate = false;
 498        __be16 proto;
 499        int tunnel_hlen;
 500        int version;
 501        int nhoff;
 502        int thoff;
 503
 504        tun_info = skb_tunnel_info(skb);
 505        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
 506                     ip_tunnel_info_af(tun_info) != AF_INET))
 507                goto err_free_skb;
 508
 509        key = &tun_info->key;
 510        if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
 511                goto err_free_skb;
 512        md = ip_tunnel_info_opts(tun_info);
 513        if (!md)
 514                goto err_free_skb;
 515
 516        /* ERSPAN has fixed 8 byte GRE header */
 517        version = md->version;
 518        tunnel_hlen = 8 + erspan_hdr_len(version);
 519
 520        if (skb_cow_head(skb, dev->needed_headroom))
 521                goto err_free_skb;
 522
 523        if (gre_handle_offloads(skb, false))
 524                goto err_free_skb;
 525
 526        if (skb->len > dev->mtu + dev->hard_header_len) {
 527                pskb_trim(skb, dev->mtu + dev->hard_header_len);
 528                truncate = true;
 529        }
 530
 531        nhoff = skb_network_header(skb) - skb_mac_header(skb);
 532        if (skb->protocol == htons(ETH_P_IP) &&
 533            (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
 534                truncate = true;
 535
 536        thoff = skb_transport_header(skb) - skb_mac_header(skb);
 537        if (skb->protocol == htons(ETH_P_IPV6) &&
 538            (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
 539                truncate = true;
 540
 541        if (version == 1) {
 542                erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
 543                                    ntohl(md->u.index), truncate, true);
 544                proto = htons(ETH_P_ERSPAN);
 545        } else if (version == 2) {
 546                erspan_build_header_v2(skb,
 547                                       ntohl(tunnel_id_to_key32(key->tun_id)),
 548                                       md->u.md2.dir,
 549                                       get_hwid(&md->u.md2),
 550                                       truncate, true);
 551                proto = htons(ETH_P_ERSPAN2);
 552        } else {
 553                goto err_free_skb;
 554        }
 555
 556        gre_build_header(skb, 8, TUNNEL_SEQ,
 557                         proto, 0, htonl(tunnel->o_seqno++));
 558
 559        ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
 560
 561        return;
 562
 563err_free_skb:
 564        kfree_skb(skb);
 565        dev->stats.tx_dropped++;
 566}
 567
 568static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 569{
 570        struct ip_tunnel_info *info = skb_tunnel_info(skb);
 571        const struct ip_tunnel_key *key;
 572        struct rtable *rt;
 573        struct flowi4 fl4;
 574
 575        if (ip_tunnel_info_af(info) != AF_INET)
 576                return -EINVAL;
 577
 578        key = &info->key;
 579        ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
 580                            tunnel_id_to_key32(key->tun_id), key->tos, 0,
 581                            skb->mark, skb_get_hash(skb));
 582        rt = ip_route_output_key(dev_net(dev), &fl4);
 583        if (IS_ERR(rt))
 584                return PTR_ERR(rt);
 585
 586        ip_rt_put(rt);
 587        info->key.u.ipv4.src = fl4.saddr;
 588        return 0;
 589}
 590
 591static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 592                              struct net_device *dev)
 593{
 594        struct ip_tunnel *tunnel = netdev_priv(dev);
 595        const struct iphdr *tnl_params;
 596
 597        if (!pskb_inet_may_pull(skb))
 598                goto free_skb;
 599
 600        if (tunnel->collect_md) {
 601                gre_fb_xmit(skb, dev, skb->protocol);
 602                return NETDEV_TX_OK;
 603        }
 604
 605        if (dev->header_ops) {
 606                /* Need space for new headers */
 607                if (skb_cow_head(skb, dev->needed_headroom -
 608                                      (tunnel->hlen + sizeof(struct iphdr))))
 609                        goto free_skb;
 610
 611                tnl_params = (const struct iphdr *)skb->data;
 612
 613                /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
 614                 * to gre header.
 615                 */
 616                skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
 617                skb_reset_mac_header(skb);
 618        } else {
 619                if (skb_cow_head(skb, dev->needed_headroom))
 620                        goto free_skb;
 621
 622                tnl_params = &tunnel->parms.iph;
 623        }
 624
 625        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
 626                goto free_skb;
 627
 628        __gre_xmit(skb, dev, tnl_params, skb->protocol);
 629        return NETDEV_TX_OK;
 630
 631free_skb:
 632        kfree_skb(skb);
 633        dev->stats.tx_dropped++;
 634        return NETDEV_TX_OK;
 635}
 636
 637static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 638                               struct net_device *dev)
 639{
 640        struct ip_tunnel *tunnel = netdev_priv(dev);
 641        bool truncate = false;
 642        __be16 proto;
 643
 644        if (!pskb_inet_may_pull(skb))
 645                goto free_skb;
 646
 647        if (tunnel->collect_md) {
 648                erspan_fb_xmit(skb, dev);
 649                return NETDEV_TX_OK;
 650        }
 651
 652        if (gre_handle_offloads(skb, false))
 653                goto free_skb;
 654
 655        if (skb_cow_head(skb, dev->needed_headroom))
 656                goto free_skb;
 657
 658        if (skb->len > dev->mtu + dev->hard_header_len) {
 659                pskb_trim(skb, dev->mtu + dev->hard_header_len);
 660                truncate = true;
 661        }
 662
 663        /* Push ERSPAN header */
 664        if (tunnel->erspan_ver == 1) {
 665                erspan_build_header(skb, ntohl(tunnel->parms.o_key),
 666                                    tunnel->index,
 667                                    truncate, true);
 668                proto = htons(ETH_P_ERSPAN);
 669        } else if (tunnel->erspan_ver == 2) {
 670                erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
 671                                       tunnel->dir, tunnel->hwid,
 672                                       truncate, true);
 673                proto = htons(ETH_P_ERSPAN2);
 674        } else {
 675                goto free_skb;
 676        }
 677
 678        tunnel->parms.o_flags &= ~TUNNEL_KEY;
 679        __gre_xmit(skb, dev, &tunnel->parms.iph, proto);
 680        return NETDEV_TX_OK;
 681
 682free_skb:
 683        kfree_skb(skb);
 684        dev->stats.tx_dropped++;
 685        return NETDEV_TX_OK;
 686}
 687
 688static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 689                                struct net_device *dev)
 690{
 691        struct ip_tunnel *tunnel = netdev_priv(dev);
 692
 693        if (!pskb_inet_may_pull(skb))
 694                goto free_skb;
 695
 696        if (tunnel->collect_md) {
 697                gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
 698                return NETDEV_TX_OK;
 699        }
 700
 701        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
 702                goto free_skb;
 703
 704        if (skb_cow_head(skb, dev->needed_headroom))
 705                goto free_skb;
 706
 707        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
 708        return NETDEV_TX_OK;
 709
 710free_skb:
 711        kfree_skb(skb);
 712        dev->stats.tx_dropped++;
 713        return NETDEV_TX_OK;
 714}
 715
 716static void ipgre_link_update(struct net_device *dev, bool set_mtu)
 717{
 718        struct ip_tunnel *tunnel = netdev_priv(dev);
 719        int len;
 720
 721        len = tunnel->tun_hlen;
 722        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
 723        len = tunnel->tun_hlen - len;
 724        tunnel->hlen = tunnel->hlen + len;
 725
 726        dev->needed_headroom = dev->needed_headroom + len;
 727        if (set_mtu)
 728                dev->mtu = max_t(int, dev->mtu - len, 68);
 729
 730        if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
 731                if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
 732                    tunnel->encap.type == TUNNEL_ENCAP_NONE) {
 733                        dev->features |= NETIF_F_GSO_SOFTWARE;
 734                        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 735                } else {
 736                        dev->features &= ~NETIF_F_GSO_SOFTWARE;
 737                        dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
 738                }
 739                dev->features |= NETIF_F_LLTX;
 740        } else {
 741                dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
 742                dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
 743        }
 744}
 745
 746static int ipgre_tunnel_ioctl(struct net_device *dev,
 747                              struct ifreq *ifr, int cmd)
 748{
 749        struct ip_tunnel_parm p;
 750        int err;
 751
 752        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 753                return -EFAULT;
 754
 755        if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
 756                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
 757                    p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
 758                    ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING)))
 759                        return -EINVAL;
 760        }
 761
 762        p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
 763        p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
 764
 765        err = ip_tunnel_ioctl(dev, &p, cmd);
 766        if (err)
 767                return err;
 768
 769        if (cmd == SIOCCHGTUNNEL) {
 770                struct ip_tunnel *t = netdev_priv(dev);
 771
 772                t->parms.i_flags = p.i_flags;
 773                t->parms.o_flags = p.o_flags;
 774
 775                if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
 776                        ipgre_link_update(dev, true);
 777        }
 778
 779        p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
 780        p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);
 781
 782        if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 783                return -EFAULT;
 784
 785        return 0;
 786}
 787
 788/* Nice toy. Unfortunately, useless in real life :-)
 789   It allows to construct virtual multiprotocol broadcast "LAN"
 790   over the Internet, provided multicast routing is tuned.
 791
 792
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
 797
 798   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
 799   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
 800
 801   ping -t 255 224.66.66.66
 802
 803   If nobody answers, mbone does not work.
 804
 805   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
 806   ip addr add 10.66.66.<somewhat>/24 dev Universe
 807   ifconfig Universe up
 808   ifconfig Universe add fe80::<Your_real_addr>/10
 809   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
 810   ftp 10.66.66.66
 811   ...
 812   ftp fec0:6666:6666::193.233.7.65
 813   ...
 814 */
 815static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
 816                        unsigned short type,
 817                        const void *daddr, const void *saddr, unsigned int len)
 818{
 819        struct ip_tunnel *t = netdev_priv(dev);
 820        struct iphdr *iph;
 821        struct gre_base_hdr *greh;
 822
 823        iph = skb_push(skb, t->hlen + sizeof(*iph));
 824        greh = (struct gre_base_hdr *)(iph+1);
 825        greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
 826        greh->protocol = htons(type);
 827
 828        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
 829
 830        /* Set the source hardware address. */
 831        if (saddr)
 832                memcpy(&iph->saddr, saddr, 4);
 833        if (daddr)
 834                memcpy(&iph->daddr, daddr, 4);
 835        if (iph->daddr)
 836                return t->hlen + sizeof(*iph);
 837
 838        return -(t->hlen + sizeof(*iph));
 839}
 840
 841static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
 842{
 843        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
 844        memcpy(haddr, &iph->saddr, 4);
 845        return 4;
 846}
 847
 848static const struct header_ops ipgre_header_ops = {
 849        .create = ipgre_header,
 850        .parse  = ipgre_header_parse,
 851};
 852
 853#ifdef CONFIG_NET_IPGRE_BROADCAST
 854static int ipgre_open(struct net_device *dev)
 855{
 856        struct ip_tunnel *t = netdev_priv(dev);
 857
 858        if (ipv4_is_multicast(t->parms.iph.daddr)) {
 859                struct flowi4 fl4;
 860                struct rtable *rt;
 861
 862                rt = ip_route_output_gre(t->net, &fl4,
 863                                         t->parms.iph.daddr,
 864                                         t->parms.iph.saddr,
 865                                         t->parms.o_key,
 866                                         RT_TOS(t->parms.iph.tos),
 867                                         t->parms.link);
 868                if (IS_ERR(rt))
 869                        return -EADDRNOTAVAIL;
 870                dev = rt->dst.dev;
 871                ip_rt_put(rt);
 872                if (!__in_dev_get_rtnl(dev))
 873                        return -EADDRNOTAVAIL;
 874                t->mlink = dev->ifindex;
 875                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
 876        }
 877        return 0;
 878}
 879
 880static int ipgre_close(struct net_device *dev)
 881{
 882        struct ip_tunnel *t = netdev_priv(dev);
 883
 884        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
 885                struct in_device *in_dev;
 886                in_dev = inetdev_by_index(t->net, t->mlink);
 887                if (in_dev)
 888                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
 889        }
 890        return 0;
 891}
 892#endif
 893
 894static const struct net_device_ops ipgre_netdev_ops = {
 895        .ndo_init               = ipgre_tunnel_init,
 896        .ndo_uninit             = ip_tunnel_uninit,
 897#ifdef CONFIG_NET_IPGRE_BROADCAST
 898        .ndo_open               = ipgre_open,
 899        .ndo_stop               = ipgre_close,
 900#endif
 901        .ndo_start_xmit         = ipgre_xmit,
 902        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
 903        .ndo_change_mtu         = ip_tunnel_change_mtu,
 904        .ndo_get_stats64        = ip_tunnel_get_stats64,
 905        .ndo_get_iflink         = ip_tunnel_get_iflink,
 906};
 907
 908#define GRE_FEATURES (NETIF_F_SG |              \
 909                      NETIF_F_FRAGLIST |        \
 910                      NETIF_F_HIGHDMA |         \
 911                      NETIF_F_HW_CSUM)
 912
 913static void ipgre_tunnel_setup(struct net_device *dev)
 914{
 915        dev->netdev_ops         = &ipgre_netdev_ops;
 916        dev->type               = ARPHRD_IPGRE;
 917        ip_tunnel_setup(dev, ipgre_net_id);
 918}
 919
 920static void __gre_tunnel_init(struct net_device *dev)
 921{
 922        struct ip_tunnel *tunnel;
 923
 924        tunnel = netdev_priv(dev);
 925        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
 926        tunnel->parms.iph.protocol = IPPROTO_GRE;
 927
 928        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
 929
 930        dev->features           |= GRE_FEATURES;
 931        dev->hw_features        |= GRE_FEATURES;
 932
 933        if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
 934                /* TCP offload with GRE SEQ is not supported, nor
 935                 * can we support 2 levels of outer headers requiring
 936                 * an update.
 937                 */
 938                if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
 939                    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
 940                        dev->features    |= NETIF_F_GSO_SOFTWARE;
 941                        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 942                }
 943
 944                /* Can use a lockless transmit, unless we generate
 945                 * output sequences
 946                 */
 947                dev->features |= NETIF_F_LLTX;
 948        }
 949}
 950
 951static int ipgre_tunnel_init(struct net_device *dev)
 952{
 953        struct ip_tunnel *tunnel = netdev_priv(dev);
 954        struct iphdr *iph = &tunnel->parms.iph;
 955
 956        __gre_tunnel_init(dev);
 957
 958        memcpy(dev->dev_addr, &iph->saddr, 4);
 959        memcpy(dev->broadcast, &iph->daddr, 4);
 960
 961        dev->flags              = IFF_NOARP;
 962        netif_keep_dst(dev);
 963        dev->addr_len           = 4;
 964
 965        if (iph->daddr && !tunnel->collect_md) {
 966#ifdef CONFIG_NET_IPGRE_BROADCAST
 967                if (ipv4_is_multicast(iph->daddr)) {
 968                        if (!iph->saddr)
 969                                return -EINVAL;
 970                        dev->flags = IFF_BROADCAST;
 971                        dev->header_ops = &ipgre_header_ops;
 972                }
 973#endif
 974        } else if (!tunnel->collect_md) {
 975                dev->header_ops = &ipgre_header_ops;
 976        }
 977
 978        return ip_tunnel_init(dev);
 979}
 980
 981static const struct gre_protocol ipgre_protocol = {
 982        .handler     = gre_rcv,
 983        .err_handler = gre_err,
 984};
 985
 986static int __net_init ipgre_init_net(struct net *net)
 987{
 988        return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
 989}
 990
 991static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
 992{
 993        ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
 994}
 995
 996static struct pernet_operations ipgre_net_ops = {
 997        .init = ipgre_init_net,
 998        .exit_batch = ipgre_exit_batch_net,
 999        .id   = &ipgre_net_id,
1000        .size = sizeof(struct ip_tunnel_net),
1001};
1002
1003static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
1004                                 struct netlink_ext_ack *extack)
1005{
1006        __be16 flags;
1007
1008        if (!data)
1009                return 0;
1010
1011        flags = 0;
1012        if (data[IFLA_GRE_IFLAGS])
1013                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1014        if (data[IFLA_GRE_OFLAGS])
1015                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1016        if (flags & (GRE_VERSION|GRE_ROUTING))
1017                return -EINVAL;
1018
1019        if (data[IFLA_GRE_COLLECT_METADATA] &&
1020            data[IFLA_GRE_ENCAP_TYPE] &&
1021            nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
1022                return -EINVAL;
1023
1024        return 0;
1025}
1026
1027static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
1028                              struct netlink_ext_ack *extack)
1029{
1030        __be32 daddr;
1031
1032        if (tb[IFLA_ADDRESS]) {
1033                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1034                        return -EINVAL;
1035                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1036                        return -EADDRNOTAVAIL;
1037        }
1038
1039        if (!data)
1040                goto out;
1041
1042        if (data[IFLA_GRE_REMOTE]) {
1043                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1044                if (!daddr)
1045                        return -EINVAL;
1046        }
1047
1048out:
1049        return ipgre_tunnel_validate(tb, data, extack);
1050}
1051
1052static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
1053                           struct netlink_ext_ack *extack)
1054{
1055        __be16 flags = 0;
1056        int ret;
1057
1058        if (!data)
1059                return 0;
1060
1061        ret = ipgre_tap_validate(tb, data, extack);
1062        if (ret)
1063                return ret;
1064
1065        /* ERSPAN should only have GRE sequence and key flag */
1066        if (data[IFLA_GRE_OFLAGS])
1067                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1068        if (data[IFLA_GRE_IFLAGS])
1069                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1070        if (!data[IFLA_GRE_COLLECT_METADATA] &&
1071            flags != (GRE_SEQ | GRE_KEY))
1072                return -EINVAL;
1073
1074        /* ERSPAN Session ID only has 10-bit. Since we reuse
1075         * 32-bit key field as ID, check it's range.
1076         */
1077        if (data[IFLA_GRE_IKEY] &&
1078            (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
1079                return -EINVAL;
1080
1081        if (data[IFLA_GRE_OKEY] &&
1082            (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
1083                return -EINVAL;
1084
1085        return 0;
1086}
1087
1088static int ipgre_netlink_parms(struct net_device *dev,
1089                                struct nlattr *data[],
1090                                struct nlattr *tb[],
1091                                struct ip_tunnel_parm *parms,
1092                                __u32 *fwmark)
1093{
1094        struct ip_tunnel *t = netdev_priv(dev);
1095
1096        memset(parms, 0, sizeof(*parms));
1097
1098        parms->iph.protocol = IPPROTO_GRE;
1099
1100        if (!data)
1101                return 0;
1102
1103        if (data[IFLA_GRE_LINK])
1104                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1105
1106        if (data[IFLA_GRE_IFLAGS])
1107                parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
1108
1109        if (data[IFLA_GRE_OFLAGS])
1110                parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
1111
1112        if (data[IFLA_GRE_IKEY])
1113                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1114
1115        if (data[IFLA_GRE_OKEY])
1116                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1117
1118        if (data[IFLA_GRE_LOCAL])
1119                parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
1120
1121        if (data[IFLA_GRE_REMOTE])
1122                parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
1123
1124        if (data[IFLA_GRE_TTL])
1125                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1126
1127        if (data[IFLA_GRE_TOS])
1128                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1129
1130        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
1131                if (t->ignore_df)
1132                        return -EINVAL;
1133                parms->iph.frag_off = htons(IP_DF);
1134        }
1135
1136        if (data[IFLA_GRE_COLLECT_METADATA]) {
1137                t->collect_md = true;
1138                if (dev->type == ARPHRD_IPGRE)
1139                        dev->type = ARPHRD_NONE;
1140        }
1141
1142        if (data[IFLA_GRE_IGNORE_DF]) {
1143                if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
1144                  && (parms->iph.frag_off & htons(IP_DF)))
1145                        return -EINVAL;
1146                t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
1147        }
1148
1149        if (data[IFLA_GRE_FWMARK])
1150                *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
1151
1152        if (data[IFLA_GRE_ERSPAN_VER]) {
1153                t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
1154
1155                if (t->erspan_ver != 1 && t->erspan_ver != 2)
1156                        return -EINVAL;
1157        }
1158
1159        if (t->erspan_ver == 1) {
1160                if (data[IFLA_GRE_ERSPAN_INDEX]) {
1161                        t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
1162                        if (t->index & ~INDEX_MASK)
1163                                return -EINVAL;
1164                }
1165        } else if (t->erspan_ver == 2) {
1166                if (data[IFLA_GRE_ERSPAN_DIR]) {
1167                        t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
1168                        if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
1169                                return -EINVAL;
1170                }
1171                if (data[IFLA_GRE_ERSPAN_HWID]) {
1172                        t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
1173                        if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
1174                                return -EINVAL;
1175                }
1176        }
1177
1178        return 0;
1179}
1180
1181/* This function returns true when ENCAP attributes are present in the nl msg */
1182static bool ipgre_netlink_encap_parms(struct nlattr *data[],
1183                                      struct ip_tunnel_encap *ipencap)
1184{
1185        bool ret = false;
1186
1187        memset(ipencap, 0, sizeof(*ipencap));
1188
1189        if (!data)
1190                return ret;
1191
1192        if (data[IFLA_GRE_ENCAP_TYPE]) {
1193                ret = true;
1194                ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
1195        }
1196
1197        if (data[IFLA_GRE_ENCAP_FLAGS]) {
1198                ret = true;
1199                ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
1200        }
1201
1202        if (data[IFLA_GRE_ENCAP_SPORT]) {
1203                ret = true;
1204                ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
1205        }
1206
1207        if (data[IFLA_GRE_ENCAP_DPORT]) {
1208                ret = true;
1209                ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
1210        }
1211
1212        return ret;
1213}
1214
1215static int gre_tap_init(struct net_device *dev)
1216{
1217        __gre_tunnel_init(dev);
1218        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1219        netif_keep_dst(dev);
1220
1221        return ip_tunnel_init(dev);
1222}
1223
1224static const struct net_device_ops gre_tap_netdev_ops = {
1225        .ndo_init               = gre_tap_init,
1226        .ndo_uninit             = ip_tunnel_uninit,
1227        .ndo_start_xmit         = gre_tap_xmit,
1228        .ndo_set_mac_address    = eth_mac_addr,
1229        .ndo_validate_addr      = eth_validate_addr,
1230        .ndo_change_mtu         = ip_tunnel_change_mtu,
1231        .ndo_get_stats64        = ip_tunnel_get_stats64,
1232        .ndo_get_iflink         = ip_tunnel_get_iflink,
1233        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
1234};
1235
1236static int erspan_tunnel_init(struct net_device *dev)
1237{
1238        struct ip_tunnel *tunnel = netdev_priv(dev);
1239
1240        tunnel->tun_hlen = 8;
1241        tunnel->parms.iph.protocol = IPPROTO_GRE;
1242        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
1243                       erspan_hdr_len(tunnel->erspan_ver);
1244
1245        dev->features           |= GRE_FEATURES;
1246        dev->hw_features        |= GRE_FEATURES;
1247        dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
1248        netif_keep_dst(dev);
1249
1250        return ip_tunnel_init(dev);
1251}
1252
1253static const struct net_device_ops erspan_netdev_ops = {
1254        .ndo_init               = erspan_tunnel_init,
1255        .ndo_uninit             = ip_tunnel_uninit,
1256        .ndo_start_xmit         = erspan_xmit,
1257        .ndo_set_mac_address    = eth_mac_addr,
1258        .ndo_validate_addr      = eth_validate_addr,
1259        .ndo_change_mtu         = ip_tunnel_change_mtu,
1260        .ndo_get_stats64        = ip_tunnel_get_stats64,
1261        .ndo_get_iflink         = ip_tunnel_get_iflink,
1262        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
1263};
1264
1265static void ipgre_tap_setup(struct net_device *dev)
1266{
1267        ether_setup(dev);
1268        dev->max_mtu = 0;
1269        dev->netdev_ops = &gre_tap_netdev_ops;
1270        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1271        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1272        ip_tunnel_setup(dev, gre_tap_net_id);
1273}
1274
1275static int ipgre_newlink(struct net *src_net, struct net_device *dev,
1276                         struct nlattr *tb[], struct nlattr *data[],
1277                         struct netlink_ext_ack *extack)
1278{
1279        struct ip_tunnel_parm p;
1280        struct ip_tunnel_encap ipencap;
1281        __u32 fwmark = 0;
1282        int err;
1283
1284        if (ipgre_netlink_encap_parms(data, &ipencap)) {
1285                struct ip_tunnel *t = netdev_priv(dev);
1286                err = ip_tunnel_encap_setup(t, &ipencap);
1287
1288                if (err < 0)
1289                        return err;
1290        }
1291
1292        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1293        if (err < 0)
1294                return err;
1295        return ip_tunnel_newlink(dev, tb, &p, fwmark);
1296}
1297
1298static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1299                            struct nlattr *data[],
1300                            struct netlink_ext_ack *extack)
1301{
1302        struct ip_tunnel *t = netdev_priv(dev);
1303        struct ip_tunnel_encap ipencap;
1304        __u32 fwmark = t->fwmark;
1305        struct ip_tunnel_parm p;
1306        int err;
1307
1308        if (ipgre_netlink_encap_parms(data, &ipencap)) {
1309                err = ip_tunnel_encap_setup(t, &ipencap);
1310
1311                if (err < 0)
1312                        return err;
1313        }
1314
1315        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1316        if (err < 0)
1317                return err;
1318
1319        err = ip_tunnel_changelink(dev, tb, &p, fwmark);
1320        if (err < 0)
1321                return err;
1322
1323        t->parms.i_flags = p.i_flags;
1324        t->parms.o_flags = p.o_flags;
1325
1326        if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
1327                ipgre_link_update(dev, !tb[IFLA_MTU]);
1328
1329        return 0;
1330}
1331
1332static size_t ipgre_get_size(const struct net_device *dev)
1333{
1334        return
1335                /* IFLA_GRE_LINK */
1336                nla_total_size(4) +
1337                /* IFLA_GRE_IFLAGS */
1338                nla_total_size(2) +
1339                /* IFLA_GRE_OFLAGS */
1340                nla_total_size(2) +
1341                /* IFLA_GRE_IKEY */
1342                nla_total_size(4) +
1343                /* IFLA_GRE_OKEY */
1344                nla_total_size(4) +
1345                /* IFLA_GRE_LOCAL */
1346                nla_total_size(4) +
1347                /* IFLA_GRE_REMOTE */
1348                nla_total_size(4) +
1349                /* IFLA_GRE_TTL */
1350                nla_total_size(1) +
1351                /* IFLA_GRE_TOS */
1352                nla_total_size(1) +
1353                /* IFLA_GRE_PMTUDISC */
1354                nla_total_size(1) +
1355                /* IFLA_GRE_ENCAP_TYPE */
1356                nla_total_size(2) +
1357                /* IFLA_GRE_ENCAP_FLAGS */
1358                nla_total_size(2) +
1359                /* IFLA_GRE_ENCAP_SPORT */
1360                nla_total_size(2) +
1361                /* IFLA_GRE_ENCAP_DPORT */
1362                nla_total_size(2) +
1363                /* IFLA_GRE_COLLECT_METADATA */
1364                nla_total_size(0) +
1365                /* IFLA_GRE_IGNORE_DF */
1366                nla_total_size(1) +
1367                /* IFLA_GRE_FWMARK */
1368                nla_total_size(4) +
1369                /* IFLA_GRE_ERSPAN_INDEX */
1370                nla_total_size(4) +
1371                /* IFLA_GRE_ERSPAN_VER */
1372                nla_total_size(1) +
1373                /* IFLA_GRE_ERSPAN_DIR */
1374                nla_total_size(1) +
1375                /* IFLA_GRE_ERSPAN_HWID */
1376                nla_total_size(2) +
1377                0;
1378}
1379
1380static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1381{
1382        struct ip_tunnel *t = netdev_priv(dev);
1383        struct ip_tunnel_parm *p = &t->parms;
1384        __be16 o_flags = p->o_flags;
1385
1386        if (t->erspan_ver == 1 || t->erspan_ver == 2) {
1387                if (!t->collect_md)
1388                        o_flags |= TUNNEL_KEY;
1389
1390                if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
1391                        goto nla_put_failure;
1392
1393                if (t->erspan_ver == 1) {
1394                        if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
1395                                goto nla_put_failure;
1396                } else {
1397                        if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
1398                                goto nla_put_failure;
1399                        if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
1400                                goto nla_put_failure;
1401                }
1402        }
1403
1404        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1405            nla_put_be16(skb, IFLA_GRE_IFLAGS,
1406                         gre_tnl_flags_to_gre_flags(p->i_flags)) ||
1407            nla_put_be16(skb, IFLA_GRE_OFLAGS,
1408                         gre_tnl_flags_to_gre_flags(o_flags)) ||
1409            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1410            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1411            nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1412            nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1413            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1414            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1415            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1416                       !!(p->iph.frag_off & htons(IP_DF))) ||
1417            nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
1418                goto nla_put_failure;
1419
1420        if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
1421                        t->encap.type) ||
1422            nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
1423                         t->encap.sport) ||
1424            nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
1425                         t->encap.dport) ||
1426            nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
1427                        t->encap.flags))
1428                goto nla_put_failure;
1429
1430        if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
1431                goto nla_put_failure;
1432
1433        if (t->collect_md) {
1434                if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
1435                        goto nla_put_failure;
1436        }
1437
1438        return 0;
1439
1440nla_put_failure:
1441        return -EMSGSIZE;
1442}
1443
1444static void erspan_setup(struct net_device *dev)
1445{
1446        struct ip_tunnel *t = netdev_priv(dev);
1447
1448        ether_setup(dev);
1449        dev->netdev_ops = &erspan_netdev_ops;
1450        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1451        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1452        ip_tunnel_setup(dev, erspan_net_id);
1453        t->erspan_ver = 1;
1454}
1455
1456static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1457        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1458        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1459        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1460        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1461        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1462        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1463        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1464        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1465        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1466        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1467        [IFLA_GRE_ENCAP_TYPE]   = { .type = NLA_U16 },
1468        [IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
1469        [IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
1470        [IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
1471        [IFLA_GRE_COLLECT_METADATA]     = { .type = NLA_FLAG },
1472        [IFLA_GRE_IGNORE_DF]    = { .type = NLA_U8 },
1473        [IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
1474        [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
1475        [IFLA_GRE_ERSPAN_VER]   = { .type = NLA_U8 },
1476        [IFLA_GRE_ERSPAN_DIR]   = { .type = NLA_U8 },
1477        [IFLA_GRE_ERSPAN_HWID]  = { .type = NLA_U16 },
1478};
1479
1480static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1481        .kind           = "gre",
1482        .maxtype        = IFLA_GRE_MAX,
1483        .policy         = ipgre_policy,
1484        .priv_size      = sizeof(struct ip_tunnel),
1485        .setup          = ipgre_tunnel_setup,
1486        .validate       = ipgre_tunnel_validate,
1487        .newlink        = ipgre_newlink,
1488        .changelink     = ipgre_changelink,
1489        .dellink        = ip_tunnel_dellink,
1490        .get_size       = ipgre_get_size,
1491        .fill_info      = ipgre_fill_info,
1492        .get_link_net   = ip_tunnel_get_link_net,
1493};
1494
1495static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1496        .kind           = "gretap",
1497        .maxtype        = IFLA_GRE_MAX,
1498        .policy         = ipgre_policy,
1499        .priv_size      = sizeof(struct ip_tunnel),
1500        .setup          = ipgre_tap_setup,
1501        .validate       = ipgre_tap_validate,
1502        .newlink        = ipgre_newlink,
1503        .changelink     = ipgre_changelink,
1504        .dellink        = ip_tunnel_dellink,
1505        .get_size       = ipgre_get_size,
1506        .fill_info      = ipgre_fill_info,
1507        .get_link_net   = ip_tunnel_get_link_net,
1508};
1509
1510static struct rtnl_link_ops erspan_link_ops __read_mostly = {
1511        .kind           = "erspan",
1512        .maxtype        = IFLA_GRE_MAX,
1513        .policy         = ipgre_policy,
1514        .priv_size      = sizeof(struct ip_tunnel),
1515        .setup          = erspan_setup,
1516        .validate       = erspan_validate,
1517        .newlink        = ipgre_newlink,
1518        .changelink     = ipgre_changelink,
1519        .dellink        = ip_tunnel_dellink,
1520        .get_size       = ipgre_get_size,
1521        .fill_info      = ipgre_fill_info,
1522        .get_link_net   = ip_tunnel_get_link_net,
1523};
1524
1525struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
1526                                        u8 name_assign_type)
1527{
1528        struct nlattr *tb[IFLA_MAX + 1];
1529        struct net_device *dev;
1530        LIST_HEAD(list_kill);
1531        struct ip_tunnel *t;
1532        int err;
1533
1534        memset(&tb, 0, sizeof(tb));
1535
1536        dev = rtnl_create_link(net, name, name_assign_type,
1537                               &ipgre_tap_ops, tb, NULL);
1538        if (IS_ERR(dev))
1539                return dev;
1540
1541        /* Configure flow based GRE device. */
1542        t = netdev_priv(dev);
1543        t->collect_md = true;
1544
1545        err = ipgre_newlink(net, dev, tb, NULL, NULL);
1546        if (err < 0) {
1547                free_netdev(dev);
1548                return ERR_PTR(err);
1549        }
1550
1551        /* openvswitch users expect packet sizes to be unrestricted,
1552         * so set the largest MTU we can.
1553         */
1554        err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
1555        if (err)
1556                goto out;
1557
1558        err = rtnl_configure_link(dev, NULL);
1559        if (err < 0)
1560                goto out;
1561
1562        return dev;
1563out:
1564        ip_tunnel_dellink(dev, &list_kill);
1565        unregister_netdevice_many(&list_kill);
1566        return ERR_PTR(err);
1567}
1568EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
1569
1570static int __net_init ipgre_tap_init_net(struct net *net)
1571{
1572        return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
1573}
1574
1575static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
1576{
1577        ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
1578}
1579
1580static struct pernet_operations ipgre_tap_net_ops = {
1581        .init = ipgre_tap_init_net,
1582        .exit_batch = ipgre_tap_exit_batch_net,
1583        .id   = &gre_tap_net_id,
1584        .size = sizeof(struct ip_tunnel_net),
1585};
1586
1587static int __net_init erspan_init_net(struct net *net)
1588{
1589        return ip_tunnel_init_net(net, erspan_net_id,
1590                                  &erspan_link_ops, "erspan0");
1591}
1592
1593static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
1594{
1595        ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
1596}
1597
1598static struct pernet_operations erspan_net_ops = {
1599        .init = erspan_init_net,
1600        .exit_batch = erspan_exit_batch_net,
1601        .id   = &erspan_net_id,
1602        .size = sizeof(struct ip_tunnel_net),
1603};
1604
1605static int __init ipgre_init(void)
1606{
1607        int err;
1608
1609        pr_info("GRE over IPv4 tunneling driver\n");
1610
1611        err = register_pernet_device(&ipgre_net_ops);
1612        if (err < 0)
1613                return err;
1614
1615        err = register_pernet_device(&ipgre_tap_net_ops);
1616        if (err < 0)
1617                goto pnet_tap_failed;
1618
1619        err = register_pernet_device(&erspan_net_ops);
1620        if (err < 0)
1621                goto pnet_erspan_failed;
1622
1623        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1624        if (err < 0) {
1625                pr_info("%s: can't add protocol\n", __func__);
1626                goto add_proto_failed;
1627        }
1628
1629        err = rtnl_link_register(&ipgre_link_ops);
1630        if (err < 0)
1631                goto rtnl_link_failed;
1632
1633        err = rtnl_link_register(&ipgre_tap_ops);
1634        if (err < 0)
1635                goto tap_ops_failed;
1636
1637        err = rtnl_link_register(&erspan_link_ops);
1638        if (err < 0)
1639                goto erspan_link_failed;
1640
1641        return 0;
1642
1643erspan_link_failed:
1644        rtnl_link_unregister(&ipgre_tap_ops);
1645tap_ops_failed:
1646        rtnl_link_unregister(&ipgre_link_ops);
1647rtnl_link_failed:
1648        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1649add_proto_failed:
1650        unregister_pernet_device(&erspan_net_ops);
1651pnet_erspan_failed:
1652        unregister_pernet_device(&ipgre_tap_net_ops);
1653pnet_tap_failed:
1654        unregister_pernet_device(&ipgre_net_ops);
1655        return err;
1656}
1657
1658static void __exit ipgre_fini(void)
1659{
1660        rtnl_link_unregister(&ipgre_tap_ops);
1661        rtnl_link_unregister(&ipgre_link_ops);
1662        rtnl_link_unregister(&erspan_link_ops);
1663        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1664        unregister_pernet_device(&ipgre_tap_net_ops);
1665        unregister_pernet_device(&ipgre_net_ops);
1666        unregister_pernet_device(&erspan_net_ops);
1667}
1668
1669module_init(ipgre_init);
1670module_exit(ipgre_fini);
1671MODULE_LICENSE("GPL");
1672MODULE_ALIAS_RTNL_LINK("gre");
1673MODULE_ALIAS_RTNL_LINK("gretap");
1674MODULE_ALIAS_RTNL_LINK("erspan");
1675MODULE_ALIAS_NETDEV("gre0");
1676MODULE_ALIAS_NETDEV("gretap0");
1677MODULE_ALIAS_NETDEV("erspan0");
1678