linux/net/ipv4/ip_gre.c
/*
 *      Linux NET3:     GRE over IP protocol decoder.
 *
 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in EVERY
   skb, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter; since cpu migration is forbidden once we enter the first
   ndo_xmit(), the counter is reliable. We force an exit if it reaches
   RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but they would
   really kill the network. The IP hop limit plays the role of
   "t->recursion" in this case, if we copy it from the packet being
   encapsulated to the outer header. It is a very good solution, but
   it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel
     so that this problem would be solved and traceroute output would
     be even more informative. This idea appeared to be wrong: only
     Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least, in my
     neighbourhood) return only 8 bytes of payload. That is the end
     of it.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner
   encapsulation made by our node. It is difficult or even impossible,
   especially taking fragmentation into account. To be short, ttl is
   not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the packets being encapsulated
   have DF set. But it is not our problem! Nobody could accuse us;
   we did all that we could do. Even if it is your gated that injected
   the fatal route to the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
                                __be32 id, u32 index, bool truncate);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

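/* ipgre_err - handle an ICMP error that quotes one of our GRE packets.
 *
 * Called from gre_err() once the GRE header of the offending packet
 * has been parsed into @tpi. Looks up the tunnel by the addresses of
 * the quoted inner IP header (reversed, since that packet was sent by
 * us) and, if found, records the event in t->err_count/t->err_time,
 * which the transmit path uses to signal link failures later.
 */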
static void ipgre_err(struct sk_buff *skb, u32 info,
                      const struct tnl_ptk_info *tpi)
{

        /* All the routers (except for Linux) return only
           8 bytes of packet payload. It means that precise relaying of
           ICMP in the real Internet is absolutely infeasible.

           Moreover, Cisco "wise men" put the GRE key in the third word
           of the GRE header. That makes it impossible to maintain even
           soft state for keyed GRE tunnels with checksums enabled. Tell
           them "thank you".

           Well, I wonder, rfc1812 was written by a Cisco employee;
           why the hell do these idiots break standards established
           by themselves???
           */
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        const struct iphdr *iph;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        unsigned int data_len = 0;
        struct ip_tunnel *t;

        switch (type) {
        default:
        case ICMP_PARAMETERPROB:
                return;

        case ICMP_DEST_UNREACH:
                switch (code) {
                case ICMP_SR_FAILED:
                case ICMP_PORT_UNREACH:
                        /* Impossible event. */
                        return;
                default:
                        /* All others are translated to HOST_UNREACH.
                           rfc2003 contains "deep thoughts" about NET_UNREACH;
                           I believe they are just ether pollution. --ANK
                         */
                        break;
                }
                break;

        case ICMP_TIME_EXCEEDED:
                if (code != ICMP_EXC_TTL)
                        return;
                data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
                break;

        case ICMP_REDIRECT:
                break;
        }

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
        t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                             iph->daddr, iph->saddr, tpi->key);

        if (!t)
                return;

#if IS_ENABLED(CONFIG_IPV6)
        if (tpi->proto == htons(ETH_P_IPV6) &&
            !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
                                        type, data_len))
                return;
#endif

        if (t->parms.iph.daddr == 0 ||
            ipv4_is_multicast(t->parms.iph.daddr))
                return;

        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
                return;

        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
                t->err_count++;
        else
                t->err_count = 1;
        t->err_time = jiffies;
}

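/* gre_err - ICMP error handler registered for IPPROTO_GRE.
 *
 * skb->data points at the IP header quoted inside the ICMP payload.
 * Frag-needed errors feed the path MTU machinery, redirects update the
 * route, and everything else is forwarded to ipgre_err() together with
 * the parsed GRE header.
 */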
static void gre_err(struct sk_buff *skb, u32 info)
{
        /* All the routers (except for Linux) return only
         * 8 bytes of packet payload. It means that precise relaying of
         * ICMP in the real Internet is absolutely infeasible.
         *
         * Moreover, Cisco "wise men" put the GRE key in the third word
         * of the GRE header. That makes it impossible to maintain even
         * soft state for keyed GRE tunnels with checksums enabled.
         * Tell them "thank you".
         *
         * Well, I wonder, rfc1812 was written by a Cisco employee;
         * why the hell do these idiots break standards established
         * by themselves???
         */

        const struct iphdr *iph = (struct iphdr *)skb->data;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct tnl_ptk_info tpi;
        bool csum_err = false;

        if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
                             iph->ihl * 4) < 0) {
                if (!csum_err)          /* csum errors are not fatal here */
                        return;
        }

        if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
                ipv4_update_pmtu(skb, dev_net(skb->dev), info,
                                 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
                return;
        }
        if (type == ICMP_REDIRECT) {
                ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
                              IPPROTO_GRE, 0);
                return;
        }

        ipgre_err(skb, info, &tpi);
}

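/* erspan_rcv - receive path for ERSPAN-in-GRE packets.
 *
 * The ERSPAN GRE header carries no key, so the 10-bit session ID from
 * the ERSPAN header doubles as the lookup key. In collect_md mode the
 * session ID and index are stashed in the skb's tunnel metadata for
 * consumers such as openvswitch; otherwise the index is remembered on
 * the tunnel itself.
 */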
static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
                      int gre_hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct metadata_dst *tun_dst = NULL;
        struct ip_tunnel_net *itn;
        struct ip_tunnel *tunnel;
        struct erspanhdr *ershdr;
        const struct iphdr *iph;
        __be32 index;
        int len;

        itn = net_generic(net, erspan_net_id);
        len = gre_hdr_len + sizeof(*ershdr);

        if (unlikely(!pskb_may_pull(skb, len)))
                return -ENOMEM;

        iph = ip_hdr(skb);
        ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);

        /* The original GRE header does not have a key field,
         * so use the ERSPAN 10-bit session ID as the key.
         */
        tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
        index = ershdr->md.index;
        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
                                  tpi->flags | TUNNEL_KEY,
                                  iph->saddr, iph->daddr, tpi->key);

        if (tunnel) {
                if (__iptunnel_pull_header(skb,
                                           gre_hdr_len + sizeof(*ershdr),
                                           htons(ETH_P_TEB),
                                           false, false) < 0)
                        goto drop;

                if (tunnel->collect_md) {
                        struct ip_tunnel_info *info;
                        struct erspan_metadata *md;
                        __be64 tun_id;
                        __be16 flags;

                        tpi->flags |= TUNNEL_KEY;
                        flags = tpi->flags;
                        tun_id = key32_to_tunnel_id(tpi->key);

                        tun_dst = ip_tun_rx_dst(skb, flags,
                                                tun_id, sizeof(*md));
                        if (!tun_dst)
                                return PACKET_REJECT;

                        md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
                        if (!md)
                                return PACKET_REJECT;

                        md->index = index;
                        info = &tun_dst->u.tun_info;
                        info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
                        info->options_len = sizeof(*md);
                } else {
                        tunnel->index = ntohl(index);
                }

                skb_reset_mac_header(skb);
                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

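/* __ipgre_rcv - shared GRE receive path: look up the tunnel matching
 * the outer addresses, flags and key, then hand the skb to
 * ip_tunnel_rcv(). When @raw_proto is true, __iptunnel_pull_header()
 * leaves an inner Ethernet frame unparsed (used below to let metadata
 * mode devices accept ETH_P_TEB). Returns PACKET_NEXT when no tunnel
 * matched, so the caller can retry against another tunnel table.
 */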
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
        struct metadata_dst *tun_dst = NULL;
        const struct iphdr *iph;
        struct ip_tunnel *tunnel;

        iph = ip_hdr(skb);
        tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
                                  iph->saddr, iph->daddr, tpi->key);

        if (tunnel) {
                if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
                                           raw_proto, false) < 0)
                        goto drop;

                if (tunnel->dev->type != ARPHRD_NONE)
                        skb_pop_mac_header(skb);
                else
                        skb_reset_mac_header(skb);
                if (tunnel->collect_md) {
                        __be16 flags;
                        __be64 tun_id;

                        flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
                        tun_id = key32_to_tunnel_id(tpi->key);
                        tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
                        if (!tun_dst)
                                return PACKET_REJECT;
                }

                ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
                return PACKET_RCVD;
        }
        return PACKET_NEXT;

drop:
        kfree_skb(skb);
        return PACKET_RCVD;
}

static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
                     int hdr_len)
{
        struct net *net = dev_net(skb->dev);
        struct ip_tunnel_net *itn;
        int res;

        if (tpi->proto == htons(ETH_P_TEB))
                itn = net_generic(net, gre_tap_net_id);
        else
                itn = net_generic(net, ipgre_net_id);

        res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
        if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
                /* ipgre tunnels in collect metadata mode should also
                 * receive ETH_P_TEB traffic.
                 */
                itn = net_generic(net, ipgre_net_id);
                res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
        }
        return res;
}

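/* gre_rcv - entry point for GRE packets, registered via
 * gre_add_protocol(). Parses the GRE header and dispatches: ERSPAN
 * payloads go to erspan_rcv(), everything else to ipgre_rcv(). If no
 * tunnel claims the packet, an ICMP port unreachable is returned to
 * the sender.
 */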
static int gre_rcv(struct sk_buff *skb)
{
        struct tnl_ptk_info tpi;
        bool csum_err = false;
        int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
                /* Looped back packet, drop it! */
                if (rt_is_output_route(skb_rtable(skb)))
                        goto drop;
        }
#endif

        hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
        if (hdr_len < 0)
                goto drop;

        if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
                if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                        return 0;
        }

        if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
                return 0;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
        kfree_skb(skb);
        return 0;
}

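/* __gre_xmit - push the GRE header (including a sequence number when
 * TUNNEL_SEQ is set) in front of the payload, then hand the skb to the
 * generic ip_tunnel_xmit() to prepend the outer IP header built from
 * @tnl_params.
 */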
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
                       const struct iphdr *tnl_params,
                       __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        if (tunnel->parms.o_flags & TUNNEL_SEQ)
                tunnel->o_seqno++;

        /* Push GRE header. */
        gre_build_header(skb, tunnel->tun_hlen,
                         tunnel->parms.o_flags, proto, tunnel->parms.o_key,
                         htonl(tunnel->o_seqno));

        ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
        return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static struct rtable *gre_get_rt(struct sk_buff *skb,
                                 struct net_device *dev,
                                 struct flowi4 *fl,
                                 const struct ip_tunnel_key *key)
{
        struct net *net = dev_net(dev);

        memset(fl, 0, sizeof(*fl));
        fl->daddr = key->u.ipv4.dst;
        fl->saddr = key->u.ipv4.src;
        fl->flowi4_tos = RT_TOS(key->tos);
        fl->flowi4_mark = skb->mark;
        fl->flowi4_proto = IPPROTO_GRE;

        return ip_route_output_key(net, fl);
}

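/* prepare_fb_xmit - route lookup and headroom check for the flow-based
 * (collect_md) transmit paths. Consults the per-tunnel dst cache when
 * usable and expands the skb head if the outer headers will not fit.
 * On failure the skb is freed and tx_dropped is bumped, so the caller
 * only has to check for a NULL route.
 */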
static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
                                      struct net_device *dev,
                                      struct flowi4 *fl,
                                      int tunnel_hlen)
{
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct rtable *rt = NULL;
        int min_headroom;
        bool use_cache;
        int err;

        tun_info = skb_tunnel_info(skb);
        key = &tun_info->key;
        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);

        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
        if (!rt) {
                rt = gre_get_rt(skb, dev, fl, key);
                if (IS_ERR(rt))
                        goto err_free_skb;
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl->saddr);
        }

        min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
                        + tunnel_hlen + sizeof(struct iphdr);
        if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
                int head_delta = SKB_DATA_ALIGN(min_headroom -
                                                skb_headroom(skb) +
                                                16);
                err = pskb_expand_head(skb, max_t(int, head_delta, 0),
                                       0, GFP_ATOMIC);
                if (unlikely(err))
                        goto err_free_rt;
        }
        return rt;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NULL;
}

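/* gre_fb_xmit - flow-based transmit: all tunnel parameters (addresses,
 * key, tos, ttl, flags) come from the skb's tunnel metadata rather
 * than from the netdevice, so a single device can serve many flows.
 */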
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
                        __be16 proto)
{
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct rtable *rt = NULL;
        struct flowi4 fl;
        int tunnel_hlen;
        __be16 df, flags;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;
        tunnel_hlen = gre_calc_hlen(key->tun_flags);

        rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
        if (!rt)
                return;

        /* Push Tunnel header. */
        if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
                goto err_free_rt;

        flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
        gre_build_header(skb, tunnel_hlen, flags, proto,
                         tunnel_id_to_key32(tun_info->key.tun_id), 0);

        df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;

        iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
                      key->tos, key->ttl, df, false);
        return;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

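/* erspan_fb_xmit - flow-based transmit for ERSPAN. Unlike plain GRE,
 * the GRE header is fixed (sequence number present, no key, no csum),
 * the payload is truncated to the device MTU when necessary, and the
 * ERSPAN index is taken from the tunnel metadata options.
 */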
static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
                           __be16 proto)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        struct erspan_metadata *md;
        struct rtable *rt = NULL;
        bool truncate = false;
        struct flowi4 fl;
        int tunnel_hlen;
        __be16 df;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto err_free_skb;

        key = &tun_info->key;

        /* ERSPAN has a fixed 8-byte GRE header */
        tunnel_hlen = 8 + sizeof(struct erspanhdr);

        rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
        if (!rt)
                return;

        if (gre_handle_offloads(skb, false))
                goto err_free_rt;

        if (skb->len > dev->mtu) {
                pskb_trim(skb, dev->mtu);
                truncate = true;
        }

        md = ip_tunnel_info_opts(tun_info);
        if (!md)
                goto err_free_rt;

        erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
                            ntohl(md->index), truncate);

        gre_build_header(skb, 8, TUNNEL_SEQ,
                         htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));

        df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;

        iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
                      key->tos, key->ttl, df, false);
        return;

err_free_rt:
        ip_rt_put(rt);
err_free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
}

static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *info = skb_tunnel_info(skb);
        struct rtable *rt;
        struct flowi4 fl4;

        if (ip_tunnel_info_af(info) != AF_INET)
                return -EINVAL;

        rt = gre_get_rt(skb, dev, &fl4, &info->key);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        ip_rt_put(rt);
        info->key.u.ipv4.src = fl4.saddr;
        return 0;
}

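/* ipgre_xmit - ndo_start_xmit for "gre" devices. collect_md devices go
 * through the flow-based path. Devices with header_ops (the broadcast
 * mode described further down) have already had outer IP+GRE headers
 * built by ipgre_header(); those are used as the template and pulled
 * back off before transmission.
 */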
static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
                              struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *tnl_params;

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, skb->protocol);
                return NETDEV_TX_OK;
        }

        if (dev->header_ops) {
                /* Need space for new headers */
                if (skb_cow_head(skb, dev->needed_headroom -
                                      (tunnel->hlen + sizeof(struct iphdr))))
                        goto free_skb;

                tnl_params = (const struct iphdr *)skb->data;

                /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
                 * to gre header.
                 */
                skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
                skb_reset_mac_header(skb);
        } else {
                if (skb_cow_head(skb, dev->needed_headroom))
                        goto free_skb;

                tnl_params = &tunnel->parms.iph;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        __gre_xmit(skb, dev, tnl_params, skb->protocol);
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

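/* tos_to_cos - map the IP TOS byte to an 802.1p-style class of service.
 * The top six bits of TOS are the DSCP (tos >> 2), and the top three
 * bits of the DSCP select the class; e.g. TOS 0xb8 is DSCP 46 (EF) and
 * yields CoS 5.
 */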
static inline u8 tos_to_cos(u8 tos)
{
        u8 dscp, cos;

        dscp = tos >> 2;
        cos = dscp >> 3;
        return cos;
}

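/* erspan_build_header - prepend the ERSPAN type II header.
 * ver_vlan packs the version with any VLAN tci found in the mirrored
 * frame; session_id packs the 10-bit session ID together with the CoS,
 * encapsulation type and truncation bit; md.index carries the 20-bit
 * port index.
 */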
static void erspan_build_header(struct sk_buff *skb,
                                __be32 id, u32 index, bool truncate)
{
        struct iphdr *iphdr = ip_hdr(skb);
        struct ethhdr *eth = eth_hdr(skb);
        enum erspan_encap_type enc_type;
        struct erspanhdr *ershdr;
        struct qtag_prefix {
                __be16 eth_type;
                __be16 tci;
        } *qp;
        u16 vlan_tci = 0;

        enc_type = ERSPAN_ENCAP_NOVLAN;

        /* If the mirrored packet has a vlan tag, extract the tci and
         * preserve the vlan header in the mirrored frame.
         */
        if (eth->h_proto == htons(ETH_P_8021Q)) {
                qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
                vlan_tci = ntohs(qp->tci);
                enc_type = ERSPAN_ENCAP_INFRAME;
        }

        skb_push(skb, sizeof(*ershdr));
        ershdr = (struct erspanhdr *)skb->data;
        memset(ershdr, 0, sizeof(*ershdr));

        ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
                                 (ERSPAN_VERSION << VER_OFFSET));
        ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
                           ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
                           (enc_type << EN_OFFSET & EN_MASK) |
                           ((truncate << T_OFFSET) & T_MASK));
        ershdr->md.index = htonl(index & INDEX_MASK);
}

static netdev_tx_t erspan_xmit(struct sk_buff *skb,
                               struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        bool truncate = false;

        if (tunnel->collect_md) {
                erspan_fb_xmit(skb, dev, skb->protocol);
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, false))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        if (skb->len - dev->hard_header_len > dev->mtu) {
                pskb_trim(skb, dev->mtu);
                truncate = true;
        }

        /* Push ERSPAN header */
        erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
        tunnel->parms.o_flags &= ~TUNNEL_KEY;
        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
                                struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        if (tunnel->collect_md) {
                gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
                return NETDEV_TX_OK;
        }

        if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
                goto free_skb;

        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;

        __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
        return NETDEV_TX_OK;

free_skb:
        kfree_skb(skb);
        dev->stats.tx_dropped++;
        return NETDEV_TX_OK;
}

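/* ipgre_tunnel_ioctl - legacy SIOCADD/CHG/DEL/GETTUNNEL interface.
 * Validates the user-supplied parameters, converts between the on-wire
 * GRE flag encoding and the internal TUNNEL_* flags around the call to
 * the generic ip_tunnel_ioctl(), and copies the result back out.
 */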
static int ipgre_tunnel_ioctl(struct net_device *dev,
                              struct ifreq *ifr, int cmd)
{
        int err;
        struct ip_tunnel_parm p;

        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                return -EFAULT;
        if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
                        return -EINVAL;
        }
        p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
        p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

        err = ip_tunnel_ioctl(dev, &p, cmd);
        if (err)
                return err;

        p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
        p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);

        if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                return -EFAULT;
        return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
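/* ipgre_header - header_ops->create for the broadcast mode above.
 * Pre-builds the outer IP header (copied from the tunnel parameters,
 * with @saddr/@daddr overriding the addresses when given) plus a GRE
 * base header in front of the payload; ipgre_xmit() later uses this as
 * its template. Returns the header length, or its negative if the
 * destination is still unset.
 */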
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
                        unsigned short type,
                        const void *daddr, const void *saddr, unsigned int len)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct iphdr *iph;
        struct gre_base_hdr *greh;

        iph = skb_push(skb, t->hlen + sizeof(*iph));
        greh = (struct gre_base_hdr *)(iph+1);
        greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
        greh->protocol = htons(type);

        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

        /* Set the source hardware address. */
        if (saddr)
                memcpy(&iph->saddr, saddr, 4);
        if (daddr)
                memcpy(&iph->daddr, daddr, 4);
        if (iph->daddr)
                return t->hlen + sizeof(*iph);

        return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);

        memcpy(haddr, &iph->saddr, 4);
        return 4;
}

static const struct header_ops ipgre_header_ops = {
        .create = ipgre_header,
        .parse  = ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
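/* For a multicast destination the tunnel behaves like a broadcast
 * interface: on open, join the group on the device that the route to
 * the multicast address resolves to; on close, leave it again.
 */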
static int ipgre_open(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr)) {
                struct flowi4 fl4;
                struct rtable *rt;

                rt = ip_route_output_gre(t->net, &fl4,
                                         t->parms.iph.daddr,
                                         t->parms.iph.saddr,
                                         t->parms.o_key,
                                         RT_TOS(t->parms.iph.tos),
                                         t->parms.link);
                if (IS_ERR(rt))
                        return -EADDRNOTAVAIL;
                dev = rt->dst.dev;
                ip_rt_put(rt);
                if (!__in_dev_get_rtnl(dev))
                        return -EADDRNOTAVAIL;
                t->mlink = dev->ifindex;
                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
        }
        return 0;
}

static int ipgre_close(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
                struct in_device *in_dev;

                in_dev = inetdev_by_index(t->net, t->mlink);
                if (in_dev)
                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
        }
        return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
        .ndo_init               = ipgre_tunnel_init,
        .ndo_uninit             = ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
        .ndo_open               = ipgre_open,
        .ndo_stop               = ipgre_close,
#endif
        .ndo_start_xmit         = ipgre_xmit,
        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
};

#define GRE_FEATURES (NETIF_F_SG |              \
                      NETIF_F_FRAGLIST |        \
                      NETIF_F_HIGHDMA |         \
                      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
        dev->netdev_ops         = &ipgre_netdev_ops;
        dev->type               = ARPHRD_IPGRE;
        ip_tunnel_setup(dev, ipgre_net_id);
}

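/* __gre_tunnel_init - shared init for gre and gretap devices. Derives
 * the per-packet overhead (GRE header plus any encapsulation header),
 * sizes needed_headroom and the default MTU from it, and enables GSO
 * whenever TCP offload can cope: sequence numbers rule it out, and a
 * GRE checksum is only offloadable when there is no outer encap header
 * that would also need updating.
 */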
static void __gre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel;
        int t_hlen;

        tunnel = netdev_priv(dev);
        tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
        tunnel->parms.iph.protocol = IPPROTO_GRE;

        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

        t_hlen = tunnel->hlen + sizeof(struct iphdr);

        dev->needed_headroom    = LL_MAX_HEADER + t_hlen + 4;
        dev->mtu                = ETH_DATA_LEN - t_hlen - 4;

        dev->features           |= GRE_FEATURES;
        dev->hw_features        |= GRE_FEATURES;

        if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
                /* TCP offload with GRE SEQ is not supported, nor
                 * can we support 2 levels of outer headers requiring
                 * an update.
                 */
                if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
                    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
                        dev->features    |= NETIF_F_GSO_SOFTWARE;
                        dev->hw_features |= NETIF_F_GSO_SOFTWARE;
                }

                /* Can use a lockless transmit, unless we generate
                 * output sequences
                 */
                dev->features |= NETIF_F_LLTX;
        }
}

static int ipgre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;

        __gre_tunnel_init(dev);

        memcpy(dev->dev_addr, &iph->saddr, 4);
        memcpy(dev->broadcast, &iph->daddr, 4);

        dev->flags              = IFF_NOARP;
        netif_keep_dst(dev);
        dev->addr_len           = 4;

        if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        if (!iph->saddr)
                                return -EINVAL;
                        dev->flags = IFF_BROADCAST;
                        dev->header_ops = &ipgre_header_ops;
                }
#endif
        } else if (!tunnel->collect_md) {
                dev->header_ops = &ipgre_header_ops;
        }

        return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
        .handler     = gre_rcv,
        .err_handler = gre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_net(struct net *net)
{
        struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);

        ip_tunnel_delete_net(itn, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
        .exit = ipgre_exit_net,
        .id   = &ipgre_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
                                 struct netlink_ext_ack *extack)
{
        __be16 flags;

        if (!data)
                return 0;

        flags = 0;
        if (data[IFLA_GRE_IFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
        if (data[IFLA_GRE_OFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
        if (flags & (GRE_VERSION|GRE_ROUTING))
                return -EINVAL;

        if (data[IFLA_GRE_COLLECT_METADATA] &&
            data[IFLA_GRE_ENCAP_TYPE] &&
            nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
                return -EINVAL;

        return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
                              struct netlink_ext_ack *extack)
{
        __be32 daddr;

        if (tb[IFLA_ADDRESS]) {
                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
                        return -EINVAL;
                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
                        return -EADDRNOTAVAIL;
        }

        if (!data)
                goto out;

        if (data[IFLA_GRE_REMOTE]) {
                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
                if (!daddr)
                        return -EINVAL;
        }

out:
        return ipgre_tunnel_validate(tb, data, extack);
}

static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        __be16 flags = 0;
        int ret;

        if (!data)
                return 0;

        ret = ipgre_tap_validate(tb, data, extack);
        if (ret)
                return ret;

        /* ERSPAN should only have the GRE sequence and key flags */
        if (data[IFLA_GRE_OFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
        if (data[IFLA_GRE_IFLAGS])
                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
        if (!data[IFLA_GRE_COLLECT_METADATA] &&
            flags != (GRE_SEQ | GRE_KEY))
                return -EINVAL;

        /* The ERSPAN session ID is only 10 bits wide. Since we reuse
         * the 32-bit key field as the ID, check its range.
         */
        if (data[IFLA_GRE_IKEY] &&
            (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
                return -EINVAL;

        if (data[IFLA_GRE_OKEY] &&
            (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
                return -EINVAL;

        return 0;
}

static int ipgre_netlink_parms(struct net_device *dev,
                               struct nlattr *data[],
                               struct nlattr *tb[],
                               struct ip_tunnel_parm *parms,
                               __u32 *fwmark)
{
        struct ip_tunnel *t = netdev_priv(dev);

        memset(parms, 0, sizeof(*parms));

        parms->iph.protocol = IPPROTO_GRE;

        if (!data)
                return 0;

        if (data[IFLA_GRE_LINK])
                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

        if (data[IFLA_GRE_IFLAGS])
                parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

        if (data[IFLA_GRE_OFLAGS])
                parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

        if (data[IFLA_GRE_IKEY])
                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

        if (data[IFLA_GRE_OKEY])
                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

        if (data[IFLA_GRE_LOCAL])
                parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

        if (data[IFLA_GRE_REMOTE])
                parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

        if (data[IFLA_GRE_TTL])
                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

        if (data[IFLA_GRE_TOS])
                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
                if (t->ignore_df)
                        return -EINVAL;
                parms->iph.frag_off = htons(IP_DF);
        }

        if (data[IFLA_GRE_COLLECT_METADATA]) {
                t->collect_md = true;
                if (dev->type == ARPHRD_IPGRE)
                        dev->type = ARPHRD_NONE;
        }

        if (data[IFLA_GRE_IGNORE_DF]) {
                if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
                  && (parms->iph.frag_off & htons(IP_DF)))
                        return -EINVAL;
                t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
        }

        if (data[IFLA_GRE_FWMARK])
                *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

        if (data[IFLA_GRE_ERSPAN_INDEX]) {
                t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);

                if (t->index & ~INDEX_MASK)
                        return -EINVAL;
        }

        return 0;
}

/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
                                      struct ip_tunnel_encap *ipencap)
{
        bool ret = false;

        memset(ipencap, 0, sizeof(*ipencap));

        if (!data)
                return ret;

        if (data[IFLA_GRE_ENCAP_TYPE]) {
                ret = true;
                ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
        }

        if (data[IFLA_GRE_ENCAP_FLAGS]) {
                ret = true;
                ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
        }

        if (data[IFLA_GRE_ENCAP_SPORT]) {
                ret = true;
                ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
        }

        if (data[IFLA_GRE_ENCAP_DPORT]) {
                ret = true;
                ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
        }

        return ret;
}

static int gre_tap_init(struct net_device *dev)
{
        __gre_tunnel_init(dev);
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        netif_keep_dst(dev);

        return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
        .ndo_init               = gre_tap_init,
        .ndo_uninit             = ip_tunnel_uninit,
        .ndo_start_xmit         = gre_tap_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
};

static int erspan_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen;

        tunnel->tun_hlen = 8;
        tunnel->parms.iph.protocol = IPPROTO_GRE;
        tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
                       sizeof(struct erspanhdr);
        t_hlen = tunnel->hlen + sizeof(struct iphdr);

        dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
        dev->mtu = ETH_DATA_LEN - t_hlen - 4;
        dev->features           |= GRE_FEATURES;
        dev->hw_features        |= GRE_FEATURES;
        dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE;
        netif_keep_dst(dev);

        return ip_tunnel_init(dev);
}

static const struct net_device_ops erspan_netdev_ops = {
        .ndo_init               = erspan_tunnel_init,
        .ndo_uninit             = ip_tunnel_uninit,
        .ndo_start_xmit         = erspan_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ip_tunnel_change_mtu,
        .ndo_get_stats64        = ip_tunnel_get_stats64,
        .ndo_get_iflink         = ip_tunnel_get_iflink,
        .ndo_fill_metadata_dst  = gre_fill_metadata_dst,
};

static void ipgre_tap_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->netdev_ops = &gre_tap_netdev_ops;
        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        ip_tunnel_setup(dev, gre_tap_net_id);
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev,
                         struct nlattr *tb[], struct nlattr *data[],
                         struct netlink_ext_ack *extack)
{
        struct ip_tunnel_parm p;
        struct ip_tunnel_encap ipencap;
        __u32 fwmark = 0;
        int err;

        if (ipgre_netlink_encap_parms(data, &ipencap)) {
                struct ip_tunnel *t = netdev_priv(dev);

                err = ip_tunnel_encap_setup(t, &ipencap);
                if (err < 0)
                        return err;
        }

        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
        if (err < 0)
                return err;
        return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
                            struct nlattr *data[],
                            struct netlink_ext_ack *extack)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct ip_tunnel_parm p;
        struct ip_tunnel_encap ipencap;
        __u32 fwmark = t->fwmark;
        int err;

        if (ipgre_netlink_encap_parms(data, &ipencap)) {
                err = ip_tunnel_encap_setup(t, &ipencap);
                if (err < 0)
                        return err;
        }

        err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
        if (err < 0)
                return err;
        return ip_tunnel_changelink(dev, tb, &p, fwmark);
}

static size_t ipgre_get_size(const struct net_device *dev)
{
        return
                /* IFLA_GRE_LINK */
                nla_total_size(4) +
                /* IFLA_GRE_IFLAGS */
                nla_total_size(2) +
                /* IFLA_GRE_OFLAGS */
                nla_total_size(2) +
                /* IFLA_GRE_IKEY */
                nla_total_size(4) +
                /* IFLA_GRE_OKEY */
                nla_total_size(4) +
                /* IFLA_GRE_LOCAL */
                nla_total_size(4) +
                /* IFLA_GRE_REMOTE */
                nla_total_size(4) +
                /* IFLA_GRE_TTL */
                nla_total_size(1) +
                /* IFLA_GRE_TOS */
                nla_total_size(1) +
                /* IFLA_GRE_PMTUDISC */
                nla_total_size(1) +
                /* IFLA_GRE_ENCAP_TYPE */
                nla_total_size(2) +
                /* IFLA_GRE_ENCAP_FLAGS */
                nla_total_size(2) +
                /* IFLA_GRE_ENCAP_SPORT */
                nla_total_size(2) +
                /* IFLA_GRE_ENCAP_DPORT */
                nla_total_size(2) +
                /* IFLA_GRE_COLLECT_METADATA */
                nla_total_size(0) +
                /* IFLA_GRE_IGNORE_DF */
                nla_total_size(1) +
                /* IFLA_GRE_FWMARK */
                nla_total_size(4) +
                /* IFLA_GRE_ERSPAN_INDEX */
                nla_total_size(4) +
                0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct ip_tunnel_parm *p = &t->parms;

        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
            nla_put_be16(skb, IFLA_GRE_IFLAGS,
                         gre_tnl_flags_to_gre_flags(p->i_flags)) ||
            nla_put_be16(skb, IFLA_GRE_OFLAGS,
                         gre_tnl_flags_to_gre_flags(p->o_flags)) ||
            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
            nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
            nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
                       !!(p->iph.frag_off & htons(IP_DF))) ||
            nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
                goto nla_put_failure;

        if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
                        t->encap.type) ||
            nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
                         t->encap.sport) ||
            nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
                         t->encap.dport) ||
            nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
                        t->encap.flags))
                goto nla_put_failure;

        if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
                goto nla_put_failure;

        if (t->collect_md) {
                if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
                        goto nla_put_failure;
        }

        if (t->index)
                if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
                        goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

static void erspan_setup(struct net_device *dev)
{
        ether_setup(dev);
        dev->netdev_ops = &erspan_netdev_ops;
        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        ip_tunnel_setup(dev, erspan_net_id);
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
        [IFLA_GRE_ENCAP_TYPE]   = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
        [IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
        [IFLA_GRE_COLLECT_METADATA]     = { .type = NLA_FLAG },
        [IFLA_GRE_IGNORE_DF]    = { .type = NLA_U8 },
        [IFLA_GRE_FWMARK]       = { .type = NLA_U32 },
        [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
        .kind           = "gre",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tunnel_setup,
        .validate       = ipgre_tunnel_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
        .kind           = "gretap",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tap_setup,
        .validate       = ipgre_tap_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
        .kind           = "erspan",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = erspan_setup,
        .validate       = erspan_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .dellink        = ip_tunnel_dellink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
        .get_link_net   = ip_tunnel_get_link_net,
};

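/* gretap_fb_dev_create - create a flow-based (collect_md) gretap
 * device from inside the kernel; openvswitch uses this for its GRE
 * vports.
 *
 * From userspace the same three link types are reached via iproute2.
 * A rough sketch (device names and addresses are made up; the erspan
 * index syntax assumes an iproute2 built with ERSPAN support):
 *
 *   ip link add gre1 type gre local 10.0.0.1 remote 10.0.0.2 ttl 64
 *   ip link add tap1 type gretap local 10.0.0.1 remote 10.0.0.2
 *   ip link add ers1 type erspan seq key 100 local 10.0.0.1 \
 *              remote 10.0.0.2 erspan 123
 *   ip link add gre2 type gretap external   # collect_md, like this helper
 */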
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
                                        u8 name_assign_type)
{
        struct nlattr *tb[IFLA_MAX + 1];
        struct net_device *dev;
        LIST_HEAD(list_kill);
        struct ip_tunnel *t;
        int err;

        memset(&tb, 0, sizeof(tb));

        dev = rtnl_create_link(net, name, name_assign_type,
                               &ipgre_tap_ops, tb);
        if (IS_ERR(dev))
                return dev;

        /* Configure flow based GRE device. */
        t = netdev_priv(dev);
        t->collect_md = true;

        err = ipgre_newlink(net, dev, tb, NULL, NULL);
        if (err < 0) {
                free_netdev(dev);
                return ERR_PTR(err);
        }

        /* openvswitch users expect packet sizes to be unrestricted,
         * so set the largest MTU we can.
         */
        err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
        if (err)
                goto out;

        err = rtnl_configure_link(dev, NULL);
        if (err < 0)
                goto out;

        return dev;
out:
        ip_tunnel_dellink(dev, &list_kill);
        unregister_netdevice_many(&list_kill);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);

static int __net_init ipgre_tap_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_net(struct net *net)
{
        struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);

        ip_tunnel_delete_net(itn, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
        .init = ipgre_tap_init_net,
        .exit = ipgre_tap_exit_net,
        .id   = &gre_tap_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
        return ip_tunnel_init_net(net, erspan_net_id,
                                  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_net(struct net *net)
{
        struct ip_tunnel_net *itn = net_generic(net, erspan_net_id);

        ip_tunnel_delete_net(itn, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
        .init = erspan_init_net,
        .exit = erspan_exit_net,
        .id   = &erspan_net_id,
        .size = sizeof(struct ip_tunnel_net),
};

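/* Module init/exit. Registration order matters: the pernet state must
 * exist before the GRE protocol hook and the rtnl link ops can create
 * devices that use it, and ipgre_init() unwinds in reverse on failure.
 */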
static int __init ipgre_init(void)
{
        int err;

        pr_info("GRE over IPv4 tunneling driver\n");

        err = register_pernet_device(&ipgre_net_ops);
        if (err < 0)
                return err;

        err = register_pernet_device(&ipgre_tap_net_ops);
        if (err < 0)
                goto pnet_tap_failed;

        err = register_pernet_device(&erspan_net_ops);
        if (err < 0)
                goto pnet_erspan_failed;

        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
        if (err < 0) {
                pr_info("%s: can't add protocol\n", __func__);
                goto add_proto_failed;
        }

        err = rtnl_link_register(&ipgre_link_ops);
        if (err < 0)
                goto rtnl_link_failed;

        err = rtnl_link_register(&ipgre_tap_ops);
        if (err < 0)
                goto tap_ops_failed;

        err = rtnl_link_register(&erspan_link_ops);
        if (err < 0)
                goto erspan_link_failed;

        return 0;

erspan_link_failed:
        rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
        rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
        unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
        unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
        unregister_pernet_device(&ipgre_net_ops);
        return err;
}

static void __exit ipgre_fini(void)
{
        rtnl_link_unregister(&ipgre_tap_ops);
        rtnl_link_unregister(&ipgre_link_ops);
        rtnl_link_unregister(&erspan_link_ops);
        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
        unregister_pernet_device(&ipgre_tap_net_ops);
        unregister_pernet_device(&ipgre_net_ops);
        unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");