linux/net/ipv4/ip_gre.c
/*
 *      Linux NET3:     GRE over IP protocol decoder.
 *
 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl),
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter; it suffices because cpu migration is forbidden once we enter
   the first ndo_xmit(). We force an exit if this counter reaches
   RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to
   the upper header. It is a very good solution, but it introduces
   two problems:

   - Routing protocols using packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would even be more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. It is difficult or even impossible,
   especially taking fragmentation into account. In short, ttl is not
   a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but the exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop which
   does not force DF, even when the packets being encapsulated have
   DF set. But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it was your gated that injected the
   fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */
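
/* A rough worked example of the mtu-degradation argument above, assuming
 * a minimal 20-byte outer IPv4 header plus a 4-byte base GRE header
 * (i.e. at least 24 bytes of overhead per nesting level): starting from
 * a 1500-byte mtu, each trip around a local loop shrinks the usable mtu
 * by >= 24 bytes, so after at most ~60 nestings it drops below 68 and
 * the looping branches are pruned.
 */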

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
                                u32 id, u32 index,
                                bool truncate, bool is_ipv4);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

static void ipgre_err(struct sk_buff *skb, u32 info,
                      const struct tnl_ptk_info *tpi)
{

        /* All the routers (except for Linux) return only
           8 bytes of packet payload. It means that precise relaying of
           ICMP in the real Internet is absolutely infeasible.

           Moreover, Cisco "wise men" put the GRE key in the third word
           of the GRE header. That makes it impossible to maintain even
           soft state for keyed GRE tunnels with checksumming enabled.
           Tell them "thank you".

           Well, I wonder: rfc1812 was written by a Cisco employee;
           why the hell do these idiots break standards established
           by themselves???
           */
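        /* Why 8 bytes is not enough, sketched on the RFC 1701/2890 word
         * layout: word 0 of the GRE header holds the flags and protocol,
         * word 1 the checksum (when C is set), and only word 2 the key
         * (when both C and K are set). An ICMP error quoting just 8 bytes
         * of payload covers words 0-1 and truncates the key of a
         * checksummed, keyed tunnel.
         */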
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        const struct iphdr *iph;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        unsigned int data_len = 0;
        struct ip_tunnel *t;

        switch (type) {
        default:
        case ICMP_PARAMETERPROB:
                return;

        case ICMP_DEST_UNREACH:
                switch (code) {
                case ICMP_SR_FAILED:
                case ICMP_PORT_UNREACH:
                        /* Impossible event. */
                        return;
                default:
                        /* All others are translated to HOST_UNREACH.
                           rfc2003 contains "deep thoughts" about NET_UNREACH;
                           I believe they are just ether pollution. --ANK
                         */
                        break;
                }
                break;

        case ICMP_TIME_EXCEEDED:
                if (code != ICMP_EXC_TTL)
                        return;
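                /* The RFC 4884 length field counts the quoted original
                 * datagram in 32-bit words, hence the multiplication
                 * by 4 below.
                 */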
                data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
                break;

        case ICMP_REDIRECT:
                break;
        }

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
        t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                             iph->daddr, iph->saddr, tpi->key);

        if (!t)
                return;

#if IS_ENABLED(CONFIG_IPV6)
        if (tpi->proto == htons(ETH_P_IPV6) &&
            !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
                                        type, data_len))
                return;
#endif

        if (t->parms.iph.daddr == 0 ||
            ipv4_is_multicast(t->parms.iph.daddr))
                return;

        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
                return;

        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
                t->err_count++;
        else
                t->err_count = 1;
        t->err_time = jiffies;
}

static void gre_err(struct sk_buff *skb, u32 info)
{
        /* All the routers (except for Linux) return only
         * 8 bytes of packet payload. It means that precise relaying of
         * ICMP in the real Internet is absolutely infeasible.
         *
         * Moreover, Cisco "wise men" put the GRE key in the third word
         * of the GRE header. That makes it impossible to maintain even
         * soft state for keyed GRE tunnels with checksumming enabled.
         * Tell them "thank you".
         *
         * Well, I wonder: rfc1812 was written by a Cisco employee;
         * why the hell do these idiots break standards established
         * by themselves???
         */

        const struct iphdr *iph = (struct iphdr *)skb->data;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct tnl_ptk_info tpi;
        bool csum_err = false;

        if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
                             iph->ihl * 4) < 0) {
                if (!csum_err)          /* ignore csum errors. */
                        return;
        }

        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
                ipv4_update_pmtu(skb, dev_net(skb->dev), info,
                                 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
                return;
        }
        if (type == ICMP_REDIRECT) {
                ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
                              IPPROTO_GRE, 0);
                return;
        }

        ipgre_err(skb, info, &tpi);
}

static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
                      int gre_hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct metadata_dst *tun_dst = NULL;
        struct erspan_base_hdr *ershdr;
        struct erspan_metadata *pkt_md;
        struct ip_tunnel_net *itn;
        struct ip_tunnel *tunnel;
        const struct iphdr *iph;
        struct erspan_md2 *md2;
        int ver;
        int len;

        itn = net_generic(net, erspan_net_id);
        len = gre_hdr_len + sizeof(*ershdr);

        /* Check the base header length. */
        if (unlikely(!pskb_may_pull(skb, len)))
                return PACKET_REJECT;

        iph = ip_hdr(skb);
        ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
        ver = ershdr->ver;

        /* The original GRE header does not have a key field.
         * Use the ERSPAN 10-bit session ID as the key.
         */
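        /* get_session_id() extracts the 10-bit session ID from the base
         * header, so the derived keys fall in the range 0-1023.
         */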
        tpi->key = cpu_to_be32(get_session_id(ershdr));
        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
                                  tpi->flags | TUNNEL_KEY,
                                  iph->saddr, iph->daddr, tpi->key);

        if (tunnel) {
                len = gre_hdr_len + erspan_hdr_len(ver);
                if (unlikely(!pskb_may_pull(skb, len)))
                        return PACKET_REJECT;

                ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
                pkt_md = (struct erspan_metadata *)(ershdr + 1);

                if (__iptunnel_pull_header(skb,
                                           len,
                                           htons(ETH_P_TEB),
                                           false, false) < 0)
                        goto drop;

                if (tunnel->collect_md) {
                        struct ip_tunnel_info *info;
                        struct erspan_metadata *md;
                        __be64 tun_id;
                        __be16 flags;

                        tpi->flags |= TUNNEL_KEY;
                        flags = tpi->flags;
                        tun_id = key32_to_tunnel_id(tpi->key);

                        tun_dst = ip_tun_rx_dst(skb, flags,
                                                tun_id, sizeof(*md));
                        if (!tun_dst)
                                return PACKET_REJECT;

                        md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
                        md->version = ver;
                        md2 = &md->u.md2;
                        memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
                                                       ERSPAN_V2_MDSIZE);

                        info = &tun_dst->u.tun_info;
                        info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
                        info->options_len = sizeof(*md);
                }

                skb_reset_mac_header(skb);
                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
        struct metadata_dst *tun_dst = NULL;
        const struct iphdr *iph;
        struct ip_tunnel *tunnel;

        iph = ip_hdr(skb);
        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                                  iph->saddr, iph->daddr, tpi->key);

        if (tunnel) {
                if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
                                           raw_proto, false) < 0)
                        goto drop;

                if (tunnel->dev->type != ARPHRD_NONE)
                        skb_pop_mac_header(skb);
                else
                        skb_reset_mac_header(skb);
                if (tunnel->collect_md) {
                        __be16 flags;
                        __be64 tun_id;

                        flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
                        tun_id = key32_to_tunnel_id(tpi->key);
                        tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
                        if (!tun_dst)
                                return PACKET_REJECT;
                }

                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
        return PACKET_NEXT;

drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                     int hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        int res;

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
        if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
                /* ipgre tunnels in collect-metadata mode should also
                 * receive ETH_P_TEB traffic.
                 */
                itn = net_generic(net, ipgre_net_id);
                res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
        }
        return res;
}

static int gre_rcv(struct sk_buff *skb)
{
        struct tnl_ptk_info tpi;
        bool csum_err = false;
        int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
                /* Looped-back packet, drop it! */
                if (rt_is_output_route(skb_rtable(skb)))
                        goto drop;
        }
#endif

        hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
        if (hdr_len < 0)
                goto drop;

        if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
                     tpi.proto == htons(ETH_P_ERSPAN2))) {
                if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                        return 0;
                goto out;
        }

        if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                return 0;

out:
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
        kfree_skb(skb);
        return 0;
}

static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
                       const struct iphdr *tnl_params,
                       __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

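        /* RFC 2890: when sequence numbers are in use, each transmitted
         * packet carries an incrementing counter so that the receiver
         * can establish the order in which packets were sent.
         */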
        if (tunnel->parms.o_flags & TUNNEL_SEQ)
                tunnel->o_seqno++;

        /* Push GRE header. */
        gre_build_header(skb, tunnel->tun_hlen,
                         tunnel->parms.o_flags, proto, tunnel->parms.o_key,
                         htonl(tunnel->o_seqno));

        ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
        return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static struct rtable *gre_get_rt(struct sk_buff *skb,
                                 struct net_device *dev,
                                 struct flowi4 *fl,
                                 const struct ip_tunnel_key *key)
{
        struct net *net = dev_net(dev);

        memset(fl, 0, sizeof(*fl));
        fl->daddr = key->u.ipv4.dst;
        fl->saddr = key->u.ipv4.src;
        fl->flowi4_tos = RT_TOS(key->tos);
        fl->flowi4_mark = skb->mark;
        fl->flowi4_proto = IPPROTO_GRE;

        return ip_route_output_key(net, fl);
}

static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
                                      struct net_device *dev,
                                      struct flowi4 *fl,
                                      int tunnel_hlen)
{
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct rtable *rt = NULL;
        int min_headroom;
        bool use_cache;
        int err;

        tun_info = skb_tunnel_info(skb);
        key = &tun_info->key;
        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);

        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
        if (!rt) {
                rt = gre_get_rt(skb, dev, fl, key);
                if (IS_ERR(rt))
                        goto err_free_skb;
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl->saddr);
        }

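        /* The headroom must cover the underlay device's link-layer header,
         * any dst-specific header, the tunnel header and a new outer IPv4
         * header; 16 extra bytes of slack are requested beyond that minimum.
         */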
        min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
                        + tunnel_hlen + sizeof(struct iphdr);
        if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
                int head_delta = SKB_DATA_ALIGN(min_headroom -
                                                skb_headroom(skb) +
                                                16);
                err = pskb_expand_head(skb, max_t(int, head_delta, 0),
                                       0, GFP_ATOMIC);
                if (unlikely(err))
                        goto err_free_rt;
        }
        return rt;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NULL;
}

static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
                        __be16 proto)
{
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct rtable *rt = NULL;
        struct flowi4 fl;
        int tunnel_hlen;
        __be16 df, flags;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;
        tunnel_hlen = gre_calc_hlen(key->tun_flags);

        rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
        if (!rt)
                return;

        /* Push the tunnel header. */
        if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
                goto err_free_rt;

        flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
        gre_build_header(skb, tunnel_hlen, flags, proto,
                         tunnel_id_to_key32(tun_info->key.tun_id), 0);

        df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;

        iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
                      key->tos, key->ttl, df, false);
        return;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
                           __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct erspan_metadata *md;
        struct rtable *rt = NULL;
        bool truncate = false;
        struct flowi4 fl;
        int tunnel_hlen;
        int version;
        __be16 df;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;
        md = ip_tunnel_info_opts(tun_info);
        if (!md)
                goto err_free_rt;

        /* ERSPAN has a fixed 8-byte GRE header: the 4-byte base header
         * plus the 4-byte sequence number field.
         */
        version = md->version;
        tunnel_hlen = 8 + erspan_hdr_len(version);

        rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
        if (!rt)
                return;

        if (gre_handle_offloads(skb, false))
                goto err_free_rt;

        if (skb->len > dev->mtu + dev->hard_header_len) {
                pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }

        if (version == 1) {
                erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
                                    ntohl(md->u.index), truncate, true);
        } else if (version == 2) {
                erspan_build_header_v2(skb,
                                       ntohl(tunnel_id_to_key32(key->tun_id)),
                                       md->u.md2.dir,
                                       get_hwid(&md->u.md2),
                                       truncate, true);
        } else {
                goto err_free_rt;
        }

        gre_build_header(skb, 8, TUNNEL_SEQ,
                         htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));

        df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;

        iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
                      key->tos, key->ttl, df, false);
        return;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

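/* Fill in the metadata dst for an egress packet: perform a route lookup
 * purely to learn which local source address the stack would choose, and
 * record it in the tunnel key.
 */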
static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *info = skb_tunnel_info(skb);
        struct rtable *rt;
        struct flowi4 fl4;

        if (ip_tunnel_info_af(info) != AF_INET)
                return -EINVAL;

        rt = gre_get_rt(skb, dev, &fl4, &info->key);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        ip_rt_put(rt);
        info->key.u.ipv4.src = fl4.saddr;
        return 0;
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
                              struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *tnl_params;

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, skb->protocol);
                return NETDEV_TX_OK;
        }

        if (dev->header_ops) {
                /* Need space for the new headers. */
                if (skb_cow_head(skb, dev->needed_headroom -
                                      (tunnel->hlen + sizeof(struct iphdr))))
                        goto free_skb;

                tnl_params = (const struct iphdr *)skb->data;

                /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
                 * to the GRE header.
                 */
                skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
                skb_reset_mac_header(skb);
        } else {
                if (skb_cow_head(skb, dev->needed_headroom))
                        goto free_skb;

                tnl_params = &tunnel->parms.iph;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        __gre_xmit(skb, dev, tnl_params, skb->protocol);
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static netdev_tx_t erspan_xmit(struct sk_buff *skb,
                               struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        bool truncate = false;

        if (tunnel->collect_md) {
                erspan_fb_xmit(skb, dev, skb->protocol);
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, false))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        if (skb->len > dev->mtu + dev->hard_header_len) {
                pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }

        /* Push the ERSPAN header. */
        if (tunnel->erspan_ver == 1)
                erspan_build_header(skb, ntohl(tunnel->parms.o_key),
                                    tunnel->index,
                                    truncate, true);
        else
                erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
                                       tunnel->dir, tunnel->hwid,
                                       truncate, true);

        tunnel->parms.o_flags &= ~TUNNEL_KEY;
        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
                                struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static void ipgre_link_update(struct net_device *dev, bool set_mtu)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int len;

        len = tunnel->tun_hlen;
        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
        len = tunnel->tun_hlen - len;
        tunnel->hlen = tunnel->hlen + len;

        dev->needed_headroom = dev->needed_headroom + len;
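        /* 68 is the minimum mtu an IPv4 host must accept: a maximal
         * 60-byte IP header plus an 8-byte fragment (RFC 791).
         */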
        if (set_mtu)
                dev->mtu = max_t(int, dev->mtu - len, 68);

        if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
                if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
                    tunnel->encap.type == TUNNEL_ENCAP_NONE) {
                        dev->features |= NETIF_F_GSO_SOFTWARE;
                        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
                }
                dev->features |= NETIF_F_LLTX;
        }
}

static int ipgre_tunnel_ioctl(struct net_device *dev,
                              struct ifreq *ifr, int cmd)
{
        struct ip_tunnel_parm p;
        int err;

        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                return -EFAULT;

        if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
                    p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
                    ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING)))
                        return -EINVAL;
        }

        p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
        p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

        err = ip_tunnel_ioctl(dev, &p, cmd);
        if (err)
                return err;

        if (cmd == SIOCCHGTUNNEL) {
                struct ip_tunnel *t = netdev_priv(dev);

                t->parms.i_flags = p.i_flags;
                t->parms.o_flags = p.o_flags;

                if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
                        ipgre_link_update(dev, true);
        }

        p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
        p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);

        if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                return -EFAULT;

        return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
                        unsigned short type,
                        const void *daddr, const void *saddr, unsigned int len)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct iphdr *iph;
        struct gre_base_hdr *greh;

        iph = skb_push(skb, t->hlen + sizeof(*iph));
        greh = (struct gre_base_hdr *)(iph+1);
        greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
        greh->protocol = htons(type);

        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

        /* Set the source hardware address. */
        if (saddr)
                memcpy(&iph->saddr, saddr, 4);
        if (daddr)
                memcpy(&iph->daddr, daddr, 4);
        if (iph->daddr)
                return t->hlen + sizeof(*iph);

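        /* No destination address yet: following the header_ops->create()
         * convention (cf. eth_header()), return the negated header length
         * to signal a partially built header.
         */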
        return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
        memcpy(haddr, &iph->saddr, 4);
        return 4;
}

static const struct header_ops ipgre_header_ops = {
        .create = ipgre_header,
        .parse  = ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr)) {
                struct flowi4 fl4;
                struct rtable *rt;

                rt = ip_route_output_gre(t->net, &fl4,
                                         t->parms.iph.daddr,
                                         t->parms.iph.saddr,
                                         t->parms.o_key,
                                         RT_TOS(t->parms.iph.tos),
                                         t->parms.link);
                if (IS_ERR(rt))
                        return -EADDRNOTAVAIL;
                dev = rt->dst.dev;
                ip_rt_put(rt);
                if (!__in_dev_get_rtnl(dev))
                        return -EADDRNOTAVAIL;
                t->mlink = dev->ifindex;
                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
        }
        return 0;
}

static int ipgre_close(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
                struct in_device *in_dev;
                in_dev = inetdev_by_index(t->net, t->mlink);
                if (in_dev)
                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
        }
        return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
        .ndo_init               = ipgre_tunnel_init,
        .ndo_uninit             = ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
        .ndo_open               = ipgre_open,
        .ndo_stop               = ipgre_close,
#endif
        .ndo_start_xmit         = ipgre_xmit,
        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
};

#define GRE_FEATURES (NETIF_F_SG |              \
                      NETIF_F_FRAGLIST |        \
                      NETIF_F_HIGHDMA |         \
                      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
        dev->netdev_ops         = &ipgre_netdev_ops;
        dev->type               = ARPHRD_IPGRE;
        ip_tunnel_setup(dev, ipgre_net_id);
}

static void __gre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel;
        int t_hlen;

        tunnel = netdev_priv(dev);
        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
        tunnel->parms.iph.protocol = IPPROTO_GRE;

        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

        t_hlen = tunnel->hlen + sizeof(struct iphdr);

        dev->features           |= GRE_FEATURES;
        dev->hw_features        |= GRE_FEATURES;

        if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
                /* TCP offload with GRE SEQ is not supported, nor can we
                 * support two levels of outer headers requiring an update.
                 */
                if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
                    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
                        dev->features    |= NETIF_F_GSO_SOFTWARE;
                        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
                }

                /* We can use a lockless transmit, unless we generate
                 * output sequences.
                 */
                dev->features |= NETIF_F_LLTX;
        }
}

static int ipgre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;

        __gre_tunnel_init(dev);

        memcpy(dev->dev_addr, &iph->saddr, 4);
        memcpy(dev->broadcast, &iph->daddr, 4);

        dev->flags              = IFF_NOARP;
        netif_keep_dst(dev);
        dev->addr_len           = 4;

        if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        if (!iph->saddr)
                                return -EINVAL;
                        dev->flags = IFF_BROADCAST;
                        dev->header_ops = &ipgre_header_ops;
                }
#endif
        } else if (!tunnel->collect_md) {
                dev->header_ops = &ipgre_header_ops;
        }

        return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
        .handler     = gre_rcv,
        .err_handler = gre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
{
        ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
        .exit_batch = ipgre_exit_batch_net,
        .id   = &ipgre_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
                                 struct netlink_ext_ack *extack)
{
        __be16 flags;

        if (!data)
                return 0;

        flags = 0;
        if (data[IFLA_GRE_IFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
        if (data[IFLA_GRE_OFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
        if (flags & (GRE_VERSION|GRE_ROUTING))
                return -EINVAL;

        if (data[IFLA_GRE_COLLECT_METADATA] &&
            data[IFLA_GRE_ENCAP_TYPE] &&
            nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
                return -EINVAL;

        return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
                              struct netlink_ext_ack *extack)
{
        __be32 daddr;

        if (tb[IFLA_ADDRESS]) {
                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
                        return -EINVAL;
                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
                        return -EADDRNOTAVAIL;
        }

        if (!data)
                goto out;

        if (data[IFLA_GRE_REMOTE]) {
                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
                if (!daddr)
                        return -EINVAL;
        }

out:
        return ipgre_tunnel_validate(tb, data, extack);
}

static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        __be16 flags = 0;
        int ret;

        if (!data)
                return 0;

        ret = ipgre_tap_validate(tb, data, extack);
        if (ret)
                return ret;

        /* ERSPAN should only have the GRE sequence and key flags set. */
        if (data[IFLA_GRE_OFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
        if (data[IFLA_GRE_IFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
        if (!data[IFLA_GRE_COLLECT_METADATA] &&
            flags != (GRE_SEQ | GRE_KEY))
                return -EINVAL;

        /* The ERSPAN session ID is only 10 bits. Since we reuse the
         * 32-bit key field as the ID, check its range (0-1023).
         */
        if (data[IFLA_GRE_IKEY] &&
            (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
                return -EINVAL;

        if (data[IFLA_GRE_OKEY] &&
            (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
                return -EINVAL;

        return 0;
}

static int ipgre_netlink_parms(struct net_device *dev,
                                struct nlattr *data[],
                                struct nlattr *tb[],
                                struct ip_tunnel_parm *parms,
                                __u32 *fwmark)
{
        struct ip_tunnel *t = netdev_priv(dev);

        memset(parms, 0, sizeof(*parms));

        parms->iph.protocol = IPPROTO_GRE;

        if (!data)
                return 0;

        if (data[IFLA_GRE_LINK])
                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

        if (data[IFLA_GRE_IFLAGS])
                parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

        if (data[IFLA_GRE_OFLAGS])
                parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

        if (data[IFLA_GRE_IKEY])
                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

        if (data[IFLA_GRE_OKEY])
                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

        if (data[IFLA_GRE_LOCAL])
                parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

        if (data[IFLA_GRE_REMOTE])
                parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

        if (data[IFLA_GRE_TTL])
                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

        if (data[IFLA_GRE_TOS])
                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

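        /* Forcing DF on the outer header (pmtudisc, the default) and
         * IGNORE_DF are mutually exclusive, hence the -EINVAL checks
         * here and in the IFLA_GRE_IGNORE_DF branch below.
         */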
        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
                if (t->ignore_df)
                        return -EINVAL;
                parms->iph.frag_off = htons(IP_DF);
        }

        if (data[IFLA_GRE_COLLECT_METADATA]) {
                t->collect_md = true;
                if (dev->type == ARPHRD_IPGRE)
                        dev->type = ARPHRD_NONE;
        }

        if (data[IFLA_GRE_IGNORE_DF]) {
                if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) &&
                    (parms->iph.frag_off & htons(IP_DF)))
                        return -EINVAL;
                t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
        }

        if (data[IFLA_GRE_FWMARK])
                *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

        if (data[IFLA_GRE_ERSPAN_VER]) {
                t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);

                if (t->erspan_ver != 1 && t->erspan_ver != 2)
                        return -EINVAL;
        }

        if (t->erspan_ver == 1) {
                if (data[IFLA_GRE_ERSPAN_INDEX]) {
                        t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
                        if (t->index & ~INDEX_MASK)
                                return -EINVAL;
                }
        } else if (t->erspan_ver == 2) {
                if (data[IFLA_GRE_ERSPAN_DIR]) {
                        t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
                        if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
                                return -EINVAL;
                }
                if (data[IFLA_GRE_ERSPAN_HWID]) {
                        t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
                        if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
                                return -EINVAL;
                }
        }

        return 0;
}

/* This function returns true when ENCAP attributes are present in the nl msg. */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
                                      struct ip_tunnel_encap *ipencap)
{
        bool ret = false;

        memset(ipencap, 0, sizeof(*ipencap));

        if (!data)
                return ret;

        if (data[IFLA_GRE_ENCAP_TYPE]) {
                ret = true;
                ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
        }

        if (data[IFLA_GRE_ENCAP_FLAGS]) {
                ret = true;
                ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
        }

        if (data[IFLA_GRE_ENCAP_SPORT]) {
                ret = true;
                ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
        }

        if (data[IFLA_GRE_ENCAP_DPORT]) {
                ret = true;
                ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
        }

        return ret;
}

static int gre_tap_init(struct net_device *dev)
{
        __gre_tunnel_init(dev);
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        netif_keep_dst(dev);

        return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
        .ndo_init               = gre_tap_init,
        .ndo_uninit             = ip_tunnel_uninit,
        .ndo_start_xmit         = gre_tap_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
};

static int erspan_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen;

        tunnel->tun_hlen = 8;
        tunnel->parms.iph.protocol = IPPROTO_GRE;
        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
                       erspan_hdr_len(tunnel->erspan_ver);
        t_hlen = tunnel->hlen + sizeof(struct iphdr);

        dev->features           |= GRE_FEATURES;
        dev->hw_features        |= GRE_FEATURES;
        dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
        netif_keep_dst(dev);

        return ip_tunnel_init(dev);
}

static const struct net_device_ops erspan_netdev_ops = {
        .ndo_init               = erspan_tunnel_init,
        .ndo_uninit             = ip_tunnel_uninit,
        .ndo_start_xmit         = erspan_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
};

static void ipgre_tap_setup(struct net_device *dev)
{
        ether_setup(dev);
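        /* A max_mtu of 0 disables the core's upper MTU bound, leaving
         * MTU validation to ip_tunnel_change_mtu().
         */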
1313        dev->max_mtu = 0;
1314        dev->netdev_ops = &gre_tap_netdev_ops;
1315        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1316        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1317        ip_tunnel_setup(dev, gre_tap_net_id);
1318}
1319
1320static int ipgre_newlink(struct net *src_net, struct net_device *dev,
1321                         struct nlattr *tb[], struct nlattr *data[],
1322                         struct netlink_ext_ack *extack)
1323{
1324        struct ip_tunnel_parm p;
1325        struct ip_tunnel_encap ipencap;
1326        __u32 fwmark = 0;
1327        int err;
1328
1329        if (ipgre_netlink_encap_parms(data, &ipencap)) {
1330                struct ip_tunnel *t = netdev_priv(dev);
1331                err = ip_tunnel_encap_setup(t, &ipencap);
1332
1333                if (err < 0)
1334                        return err;
1335        }
1336
1337        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1338        if (err < 0)
1339                return err;
1340        return ip_tunnel_newlink(dev, tb, &p, fwmark);
1341}
1342
1343static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1344                            struct nlattr *data[],
1345                            struct netlink_ext_ack *extack)
1346{
1347        struct ip_tunnel *t = netdev_priv(dev);
1348        struct ip_tunnel_encap ipencap;
1349        __u32 fwmark = t->fwmark;
1350        struct ip_tunnel_parm p;
1351        int err;
1352
1353        if (ipgre_netlink_encap_parms(data, &ipencap)) {
1354                err = ip_tunnel_encap_setup(t, &ipencap);
1355
1356                if (err < 0)
1357                        return err;
1358        }
1359
1360        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1361        if (err < 0)
1362                return err;
1363
1364        err = ip_tunnel_changelink(dev, tb, &p, fwmark);
1365        if (err < 0)
1366                return err;
1367
1368        t->parms.i_flags = p.i_flags;
1369        t->parms.o_flags = p.o_flags;
1370
1371        if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
1372                ipgre_link_update(dev, !tb[IFLA_MTU]);
1373
1374        return 0;
1375}
1376
1377static size_t ipgre_get_size(const struct net_device *dev)
1378{
1379        return
1380                /* IFLA_GRE_LINK */
1381                nla_total_size(4) +
1382                /* IFLA_GRE_IFLAGS */
1383                nla_total_size(2) +
1384                /* IFLA_GRE_OFLAGS */
1385                nla_total_size(2) +
1386                /* IFLA_GRE_IKEY */
1387                nla_total_size(4) +
1388                /* IFLA_GRE_OKEY */
1389                nla_total_size(4) +
1390                /* IFLA_GRE_LOCAL */
1391                nla_total_size(4) +
1392                /* IFLA_GRE_REMOTE */
1393                nla_total_size(4) +
1394                /* IFLA_GRE_TTL */
1395                nla_total_size(1) +
1396                /* IFLA_GRE_TOS */
1397                nla_total_size(1) +
1398                /* IFLA_GRE_PMTUDISC */
1399                nla_total_size(1) +
1400                /* IFLA_GRE_ENCAP_TYPE */
1401                nla_total_size(2) +
1402                /* IFLA_GRE_ENCAP_FLAGS */
1403                nla_total_size(2) +
1404                /* IFLA_GRE_ENCAP_SPORT */
1405                nla_total_size(2) +
1406                /* IFLA_GRE_ENCAP_DPORT */
1407                nla_total_size(2) +
1408                /* IFLA_GRE_COLLECT_METADATA */
1409                nla_total_size(0) +
1410                /* IFLA_GRE_IGNORE_DF */
1411                nla_total_size(1) +
1412                /* IFLA_GRE_FWMARK */
1413                nla_total_size(4) +
1414                /* IFLA_GRE_ERSPAN_INDEX */
1415                nla_total_size(4) +
1416                /* IFLA_GRE_ERSPAN_VER */
1417                nla_total_size(1) +
1418                /* IFLA_GRE_ERSPAN_DIR */
1419                nla_total_size(1) +
1420                /* IFLA_GRE_ERSPAN_HWID */
1421                nla_total_size(2) +
1422                0;
1423}
1424
1425static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1426{
1427        struct ip_tunnel *t = netdev_priv(dev);
1428        struct ip_tunnel_parm *p = &t->parms;
1429
1430        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1431            nla_put_be16(skb, IFLA_GRE_IFLAGS,
1432                         gre_tnl_flags_to_gre_flags(p->i_flags)) ||
1433            nla_put_be16(skb, IFLA_GRE_OFLAGS,
1434                         gre_tnl_flags_to_gre_flags(p->o_flags)) ||
1435            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1436            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1437            nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1438            nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1439            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1440            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1441            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1442                       !!(p->iph.frag_off & htons(IP_DF))) ||
1443            nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
1444                goto nla_put_failure;
1445
1446        if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
1447                        t->encap.type) ||
1448            nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
1449                         t->encap.sport) ||
1450            nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
1451                         t->encap.dport) ||
1452            nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
1453                        t->encap.flags))
1454                goto nla_put_failure;
1455
1456        if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
1457                goto nla_put_failure;
1458
1459        if (t->collect_md) {
1460                if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
1461                        goto nla_put_failure;
1462        }
1463
1464        if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
1465                goto nla_put_failure;
1466
1467        if (t->erspan_ver == 1) {
1468                if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
1469                        goto nla_put_failure;
1470        } else if (t->erspan_ver == 2) {
1471                if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
1472                        goto nla_put_failure;
1473                if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
1474                        goto nla_put_failure;
1475        }
1476
1477        return 0;
1478
1479nla_put_failure:
1480        return -EMSGSIZE;
1481}
1482
1483static void erspan_setup(struct net_device *dev)
1484{
1485        ether_setup(dev);
1486        dev->netdev_ops = &erspan_netdev_ops;
1487        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1488        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1489        ip_tunnel_setup(dev, erspan_net_id);
1490}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
};
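
/* rtnetlink checks IFLA_INFO_DATA against this table before the newlink or
 * changelink handler runs, so an undersized attribute (say, a two-byte
 * IFLA_GRE_LOCAL where a four-byte IPv4 address is required) is rejected
 * before ipgre_newlink() ever sees it.
 */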

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
	.kind		= "erspan",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= erspan_setup,
	.validate	= erspan_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};
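
/* All three link types share the same newlink/changelink/dellink paths and
 * differ only in setup and validation.  From userspace they are selected by
 * the matching "type" keyword; illustrative iproute2 invocations (option
 * spelling may vary by iproute2 version):
 *
 *	ip link add gre1 type gre local 192.0.2.1 remote 192.0.2.2 ttl 64
 *	ip link add tap1 type gretap local 192.0.2.1 remote 192.0.2.2 key 42
 *	ip link add er1 type erspan local 192.0.2.1 remote 192.0.2.2 \
 *		seq key 10 erspan_ver 1 erspan 123
 */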

struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	LIST_HEAD(list_kill);
	struct ip_tunnel *t;
	int err;

	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &ipgre_tap_ops, tb);
	if (IS_ERR(dev))
		return dev;

	/* Configure flow-based GRE device. */
	t = netdev_priv(dev);
	t->collect_md = true;

	err = ipgre_newlink(net, dev, tb, NULL, NULL);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
	if (err)
		goto out;

	err = rtnl_configure_link(dev, NULL);
	if (err < 0)
		goto out;

	return dev;
out:
	ip_tunnel_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
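
/* Usage sketch for a hypothetical caller (e.g. an openvswitch-style vport
 * backend).  The call must run under RTNL, since rtnl_create_link() and
 * rtnl_configure_link() above assume the lock is held:
 *
 *	struct net_device *dev;
 *
 *	rtnl_lock();
 *	dev = gretap_fb_dev_create(net, "gretap_sys", NET_NAME_USER);
 *	rtnl_unlock();
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 */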

static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit_batch = ipgre_tap_exit_batch_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, erspan_net_id,
				  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
{
	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
	.init = erspan_init_net,
	.exit_batch = erspan_exit_batch_net,
	.id   = &erspan_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
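
/* Each pernet registration gives every network namespace its own tunnel
 * state plus a fallback device ("gretap0" / "erspan0" here), and the
 * exit_batch hook tears down all tunnels of the dying namespaces in one
 * batch rather than taking the rtnl lock per namespace.
 */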

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = register_pernet_device(&ipgre_tap_net_ops);
	if (err < 0)
		goto pnet_tap_failed;

	err = register_pernet_device(&erspan_net_ops);
	if (err < 0)
		goto pnet_erspan_failed;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	err = rtnl_link_register(&erspan_link_ops);
	if (err < 0)
		goto erspan_link_failed;

	return 0;

erspan_link_failed:
	rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
	unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
	unregister_pernet_device(&ipgre_net_ops);
	return err;
}
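
/* The error labels above form the usual goto unwind ladder: each failure
 * point jumps to the label that undoes everything registered so far, in
 * exact reverse order of registration, so a partly initialised module never
 * leaves stale protocol or rtnl handlers behind.
 */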

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	rtnl_link_unregister(&erspan_link_ops);
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
	unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");
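
/* The MODULE_ALIAS_RTNL_LINK() entries make "ip link add ... type
 * gre|gretap|erspan" autoload this module: rtnl_newlink() requests
 * "rtnl-link-<kind>" when the kind is unknown, and these aliases resolve
 * that name.  The MODULE_ALIAS_NETDEV() entries do the same for ioctl
 * references to the fallback device names gre0, gretap0 and erspan0, which
 * dev_load() turns into "netdev-<name>" module requests.
 */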