/* linux/net/ipv4/ip_gre.c */
   1/*
   2 *      Linux NET3:     GRE over IP protocol decoder.
   3 *
   4 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   5 *
   6 *      This program is free software; you can redistribute it and/or
   7 *      modify it under the terms of the GNU General Public License
   8 *      as published by the Free Software Foundation; either version
   9 *      2 of the License, or (at your option) any later version.
  10 *
  11 */
  12
  13#include <linux/capability.h>
  14#include <linux/module.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/slab.h>
  18#include <asm/uaccess.h>
  19#include <linux/skbuff.h>
  20#include <linux/netdevice.h>
  21#include <linux/in.h>
  22#include <linux/tcp.h>
  23#include <linux/udp.h>
  24#include <linux/if_arp.h>
  25#include <linux/mroute.h>
  26#include <linux/init.h>
  27#include <linux/in6.h>
  28#include <linux/inetdevice.h>
  29#include <linux/igmp.h>
  30#include <linux/netfilter_ipv4.h>
  31#include <linux/etherdevice.h>
  32#include <linux/if_ether.h>
  33
  34#include <net/sock.h>
  35#include <net/ip.h>
  36#include <net/icmp.h>
  37#include <net/protocol.h>
  38#include <net/ipip.h>
  39#include <net/arp.h>
  40#include <net/checksum.h>
  41#include <net/dsfield.h>
  42#include <net/inet_ecn.h>
  43#include <net/xfrm.h>
  44#include <net/net_namespace.h>
  45#include <net/netns/generic.h>
  46#include <net/rtnetlink.h>
  47#include <net/gre.h>
  48
  49#if IS_ENABLED(CONFIG_IPV6)
  50#include <net/ipv6.h>
  51#include <net/ip6_fib.h>
  52#include <net/ip6_route.h>
  53#endif
  54
  55/*
  56   Problems & solutions
  57   --------------------
  58
  59   1. The most important issue is detecting local dead loops.
  60   They would cause complete host lockup in transmit, which
  61   would be "resolved" by stack overflow or, if queueing is enabled,
  62   with infinite looping in net_bh.
  63
  64   We cannot track such dead loops during route installation,
  65   it is infeasible task. The most general solutions would be
  66   to keep skb->encapsulation counter (sort of local ttl),
  67   and silently drop packet when it expires. It is a good
  68   solution, but it supposes maintaining new variable in ALL
  69   skb, even if no tunneling is used.
  70
  71   Current solution: xmit_recursion breaks dead loops. This is a percpu
  72   counter, since when we enter the first ndo_xmit(), cpu migration is
  73   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
  74
  75   2. Networking dead loops would not kill routers, but would really
  76   kill network. IP hop limit plays role of "t->recursion" in this case,
  77   if we copy it from packet being encapsulated to upper header.
  78   It is very good solution, but it introduces two problems:
  79
  80   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  81     do not work over tunnels.
  82   - traceroute does not work. I planned to relay ICMP from tunnel,
  83     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
  85     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  86     true router now :-)), all routers (at least, in neighbourhood of mine)
  87     return only 8 bytes of payload. It is the end.
  88
  89   Hence, if we want that OSPF worked or traceroute said something reasonable,
  90   we should search for another solution.
  91
  92   One of them is to parse packet trying to detect inner encapsulation
  93   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.
  95
  96   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  97   We force DF flag on tunnels with preconfigured hop limit,
  98   that is ALL. :-) Well, it does not remove the problem completely,
  99   but exponential growth of network traffic is changed to linear
 100   (branches, that exceed pmtu are pruned) and tunnel mtu
 101   rapidly degrades to value <68, where looping stops.
 102   Yes, it is not good if there exists a router in the loop,
 103   which does not force DF, even when encapsulating packets have DF set.
 104   But it is not our problem! Nobody could accuse us, we made
 105   all that we could make. Even if it is your gated who injected
 106   fatal route to network, even if it were you who configured
 107   fatal static route: you are innocent. :-)
 108
 109
 110
 111   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
 112   practically identical code. It would be good to glue them
 113   together, but it is not very evident, how to make them modular.
 114   sit is integral part of IPv6, ipip and gre are naturally modular.
 115   We could extract common parts (hash table, ioctl etc)
 116   to a separate module (ip_tunnel.c).
 117
 118   Alexey Kuznetsov.
 119 */
 120
/* Forward declarations for ops and helpers defined later in this file. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

/* Per-network-namespace state, registered under ipgre_net_id. */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
	/* Four hash tables indexed by (remote,local) specificity; see the
	 * "4 hash tables" comment below for the table layout. */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/* Catch-all device that receives otherwise-unmatched GRE packets. */
	struct net_device *fb_tunnel_dev;
};
 136
 137/* Tunnel hash table */
 138
 139/*
 140   4 hash tables:
 141
 142   3: (remote,local)
 143   2: (remote,*)
 144   1: (*,local)
 145   0: (*,*)
 146
 147   We require exact key match i.e. if a key is present in packet
 148   it will match only tunnel with the same key; if it is not present,
 149   it will match only keyless tunnel.
 150
   All keyless packets, if not matching any configured keyless tunnel,
   will match the fallback tunnel.
 153 */
 154
/* Fold a 32-bit address (or key) into a 4-bit hash bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases into the tunnels[][] array, ordered by match specificity. */
#define tunnels_r_l     tunnels[3]	/* (remote, local)	*/
#define tunnels_r       tunnels[2]	/* (remote, *)		*/
#define tunnels_l       tunnels[1]	/* (*, local)		*/
#define tunnels_wc      tunnels[0]	/* (*, *) wildcard	*/
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one RCU-protected hash chain; assigns to a variable 't' that
 * the caller must have declared in scope. */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 167
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	unsigned long	rx_packets;	/* decapsulated packets received */
	unsigned long	rx_bytes;
	unsigned long	tx_packets;	/* encapsulated packets sent */
	unsigned long	tx_bytes;
} __attribute__((aligned(4*sizeof(unsigned long))));
 175
 176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
 177{
 178        struct pcpu_tstats sum = { 0 };
 179        int i;
 180
 181        for_each_possible_cpu(i) {
 182                const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
 183
 184                sum.rx_packets += tstats->rx_packets;
 185                sum.rx_bytes   += tstats->rx_bytes;
 186                sum.tx_packets += tstats->tx_packets;
 187                sum.tx_bytes   += tstats->tx_bytes;
 188        }
 189        dev->stats.rx_packets = sum.rx_packets;
 190        dev->stats.rx_bytes   = sum.rx_bytes;
 191        dev->stats.tx_packets = sum.tx_packets;
 192        dev->stats.tx_bytes   = sum.tx_bytes;
 193        return &dev->stats;
 194}
 195
/* Given src, dst and key, find appropriate for input tunnel. */

/*
 * Look up the receive tunnel for a GRE packet, most specific first:
 * (remote,local), then (remote,*), then (*,local), then (*,*), finally
 * the namespace's fallback device.  Within each table a tunnel that
 * matches on both incoming link and device type wins outright; otherwise
 * the best-scoring candidate seen so far is remembered and returned at
 * the end.  Runs under rcu_read_lock().
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payloads belong to Ethernet (gretap) devices. */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* score bits: 1 = link mismatch, 2 = device-type mismatch.
	 * cand_score starts at 4, above any achievable score. */
	int score, cand_score = 4;

	/* Pass 1: fully-specified (remote, local) tunnels. */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;	/* exact match */

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 2: (remote, *) tunnels. */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 3: (*, local) tunnels; a multicast destination may also
	 * match a tunnel whose configured daddr equals that group. */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 4: (*, *) wildcard tunnels, matched on key only. */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Last resort: the fallback tunnel, if it is up. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
 319
 320static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
 321                struct ip_tunnel_parm *parms)
 322{
 323        __be32 remote = parms->iph.daddr;
 324        __be32 local = parms->iph.saddr;
 325        __be32 key = parms->i_key;
 326        unsigned int h = HASH(key);
 327        int prio = 0;
 328
 329        if (local)
 330                prio |= 1;
 331        if (remote && !ipv4_is_multicast(remote)) {
 332                prio |= 2;
 333                h ^= HASH(remote);
 334        }
 335
 336        return &ign->tunnels[prio][h];
 337}
 338
/* Bucket for an already-initialized tunnel: hash its own parameters. */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
 344
/*
 * Insert @t at the head of its hash chain.  The two rcu_assign_pointer()
 * calls order the stores (t->next is published before the chain head is
 * updated) so concurrent RCU readers always see a consistent chain.
 * Called with RTNL held (rtnl_dereference() of the chain head).
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
 352
/*
 * Remove @t from its hash chain, if linked.  The chain is walked with
 * RTNL held (rtnl_dereference()) and the entry is spliced out via
 * rcu_assign_pointer() so RCU readers never observe a broken list.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
 367
 368static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 369                                           struct ip_tunnel_parm *parms,
 370                                           int type)
 371{
 372        __be32 remote = parms->iph.daddr;
 373        __be32 local = parms->iph.saddr;
 374        __be32 key = parms->i_key;
 375        int link = parms->link;
 376        struct ip_tunnel *t;
 377        struct ip_tunnel __rcu **tp;
 378        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 379
 380        for (tp = __ipgre_bucket(ign, parms);
 381             (t = rtnl_dereference(*tp)) != NULL;
 382             tp = &t->next)
 383                if (local == t->parms.iph.saddr &&
 384                    remote == t->parms.iph.daddr &&
 385                    key == t->parms.i_key &&
 386                    link == t->parms.link &&
 387                    type == t->dev->type)
 388                        break;
 389
 390        return t;
 391}
 392
/*
 * Find a tunnel device matching @parms, or - when @create is set and
 * none exists - allocate and register a new one (named "gre%d" unless
 * a name was supplied in @parms).  Returns NULL when not found and
 * !create, or when allocation/registration fails.  Called with RTNL
 * held (required by register_netdevice() and ipgre_tunnel_find()).
 */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");	/* kernel picks the index */

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	/* Resolve the underlying device and derive the tunnel MTU. */
	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	/* Reference dropped in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
 437
 438static void ipgre_tunnel_uninit(struct net_device *dev)
 439{
 440        struct net *net = dev_net(dev);
 441        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 442
 443        ipgre_tunnel_unlink(ign, netdev_priv(dev));
 444        dev_put(dev);
 445}
 446
 447
/*
 * ICMP error handler for IPPROTO_GRE packets we sent: parse the copy
 * of our GRE header returned inside the ICMP payload, find the matching
 * tunnel and record the error (err_count/err_time) so that the transmit
 * path can relay dst_link_failure() to subsequent senders.
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	/* p points at the GRE header embedded in the ICMP payload. */
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		/* Only version 0 without routing headers is parsable here. */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	/* With GRE_KEY set, the key is the last 32-bit word of the
	 * GRE header; addresses are swapped (we were the sender). */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* ttl==0 means "inherit inner ttl" (see xmit), so TTL-exceeded
	 * errors are expected there (e.g. traceroute) - ignore them. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}
 535
 536static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
 537{
 538        if (INET_ECN_is_ce(iph->tos)) {
 539                if (skb->protocol == htons(ETH_P_IP)) {
 540                        IP_ECN_set_ce(ip_hdr(skb));
 541                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 542                        IP6_ECN_set_ce(ipv6_hdr(skb));
 543                }
 544        }
 545}
 546
 547static inline u8
 548ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
 549{
 550        u8 inner = 0;
 551        if (skb->protocol == htons(ETH_P_IP))
 552                inner = old_iph->tos;
 553        else if (skb->protocol == htons(ETH_P_IPV6))
 554                inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 555        return INET_ECN_encapsulate(tos, inner);
 556}
 557
/*
 * Receive handler for IPPROTO_GRE.  Validates the GRE header (version,
 * checksum, key, sequence number), finds the owning tunnel, strips the
 * encapsulation and re-injects the inner packet via netif_rx().
 * Unmatched packets are answered with ICMP port-unreachable.  Always
 * consumes the skb and returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* base GRE header: flags + protocol */
	__be16 gre_proto;

	/* 16 bytes covers the largest header layout parsed below. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* First nibble 0x4 = inner IPv4 follows directly;
			 * anything else is the WCCPv2 redirect header. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip the outer IP + GRE headers. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Checksum present but wrong, or required but absent. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		/* Enforce in-order delivery when sequencing is configured. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			/* Re-read iph: pskb_may_pull may have moved data. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}
 694
 695static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 696{
 697        struct ip_tunnel *tunnel = netdev_priv(dev);
 698        struct pcpu_tstats *tstats;
 699        const struct iphdr  *old_iph = ip_hdr(skb);
 700        const struct iphdr  *tiph;
 701        struct flowi4 fl4;
 702        u8     tos;
 703        __be16 df;
 704        struct rtable *rt;                      /* Route to the other host */
 705        struct net_device *tdev;                /* Device to other host */
 706        struct iphdr  *iph;                     /* Our new IP header */
 707        unsigned int max_headroom;              /* The extra header space needed */
 708        int    gre_hlen;
 709        __be32 dst;
 710        int    mtu;
 711
 712        if (dev->type == ARPHRD_ETHER)
 713                IPCB(skb)->flags = 0;
 714
 715        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 716                gre_hlen = 0;
 717                tiph = (const struct iphdr *)skb->data;
 718        } else {
 719                gre_hlen = tunnel->hlen;
 720                tiph = &tunnel->parms.iph;
 721        }
 722
 723        if ((dst = tiph->daddr) == 0) {
 724                /* NBMA tunnel */
 725
 726                if (skb_dst(skb) == NULL) {
 727                        dev->stats.tx_fifo_errors++;
 728                        goto tx_error;
 729                }
 730
 731                if (skb->protocol == htons(ETH_P_IP)) {
 732                        rt = skb_rtable(skb);
 733                        if ((dst = rt->rt_gateway) == 0)
 734                                goto tx_error_icmp;
 735                }
 736#if IS_ENABLED(CONFIG_IPV6)
 737                else if (skb->protocol == htons(ETH_P_IPV6)) {
 738                        struct neighbour *neigh = dst_get_neighbour_noref(skb_dst(skb));
 739                        const struct in6_addr *addr6;
 740                        int addr_type;
 741
 742                        if (neigh == NULL)
 743                                goto tx_error;
 744
 745                        addr6 = (const struct in6_addr *)&neigh->primary_key;
 746                        addr_type = ipv6_addr_type(addr6);
 747
 748                        if (addr_type == IPV6_ADDR_ANY) {
 749                                addr6 = &ipv6_hdr(skb)->daddr;
 750                                addr_type = ipv6_addr_type(addr6);
 751                        }
 752
 753                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 754                                goto tx_error_icmp;
 755
 756                        dst = addr6->s6_addr32[3];
 757                }
 758#endif
 759                else
 760                        goto tx_error;
 761        }
 762
 763        tos = tiph->tos;
 764        if (tos == 1) {
 765                tos = 0;
 766                if (skb->protocol == htons(ETH_P_IP))
 767                        tos = old_iph->tos;
 768                else if (skb->protocol == htons(ETH_P_IPV6))
 769                        tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 770        }
 771
 772        rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
 773                                 tunnel->parms.o_key, RT_TOS(tos),
 774                                 tunnel->parms.link);
 775        if (IS_ERR(rt)) {
 776                dev->stats.tx_carrier_errors++;
 777                goto tx_error;
 778        }
 779        tdev = rt->dst.dev;
 780
 781        if (tdev == dev) {
 782                ip_rt_put(rt);
 783                dev->stats.collisions++;
 784                goto tx_error;
 785        }
 786
 787        df = tiph->frag_off;
 788        if (df)
 789                mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
 790        else
 791                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 792
 793        if (skb_dst(skb))
 794                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 795
 796        if (skb->protocol == htons(ETH_P_IP)) {
 797                df |= (old_iph->frag_off&htons(IP_DF));
 798
 799                if ((old_iph->frag_off&htons(IP_DF)) &&
 800                    mtu < ntohs(old_iph->tot_len)) {
 801                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 802                        ip_rt_put(rt);
 803                        goto tx_error;
 804                }
 805        }
 806#if IS_ENABLED(CONFIG_IPV6)
 807        else if (skb->protocol == htons(ETH_P_IPV6)) {
 808                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 809
 810                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
 811                        if ((tunnel->parms.iph.daddr &&
 812                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 813                            rt6->rt6i_dst.plen == 128) {
 814                                rt6->rt6i_flags |= RTF_MODIFIED;
 815                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 816                        }
 817                }
 818
 819                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
 820                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 821                        ip_rt_put(rt);
 822                        goto tx_error;
 823                }
 824        }
 825#endif
 826
 827        if (tunnel->err_count > 0) {
 828                if (time_before(jiffies,
 829                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 830                        tunnel->err_count--;
 831
 832                        dst_link_failure(skb);
 833                } else
 834                        tunnel->err_count = 0;
 835        }
 836
 837        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
 838
 839        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 840            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
 841                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 842                if (max_headroom > dev->needed_headroom)
 843                        dev->needed_headroom = max_headroom;
 844                if (!new_skb) {
 845                        ip_rt_put(rt);
 846                        dev->stats.tx_dropped++;
 847                        dev_kfree_skb(skb);
 848                        return NETDEV_TX_OK;
 849                }
 850                if (skb->sk)
 851                        skb_set_owner_w(new_skb, skb->sk);
 852                dev_kfree_skb(skb);
 853                skb = new_skb;
 854                old_iph = ip_hdr(skb);
 855        }
 856
 857        skb_reset_transport_header(skb);
 858        skb_push(skb, gre_hlen);
 859        skb_reset_network_header(skb);
 860        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 861        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 862                              IPSKB_REROUTED);
 863        skb_dst_drop(skb);
 864        skb_dst_set(skb, &rt->dst);
 865
 866        /*
 867         *      Push down and install the IPIP header.
 868         */
 869
 870        iph                     =       ip_hdr(skb);
 871        iph->version            =       4;
 872        iph->ihl                =       sizeof(struct iphdr) >> 2;
 873        iph->frag_off           =       df;
 874        iph->protocol           =       IPPROTO_GRE;
 875        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
 876        iph->daddr              =       fl4.daddr;
 877        iph->saddr              =       fl4.saddr;
 878
 879        if ((iph->ttl = tiph->ttl) == 0) {
 880                if (skb->protocol == htons(ETH_P_IP))
 881                        iph->ttl = old_iph->ttl;
 882#if IS_ENABLED(CONFIG_IPV6)
 883                else if (skb->protocol == htons(ETH_P_IPV6))
 884                        iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
 885#endif
 886                else
 887                        iph->ttl = ip4_dst_hoplimit(&rt->dst);
 888        }
 889
 890        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
 891        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
 892                                   htons(ETH_P_TEB) : skb->protocol;
 893
 894        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 895                __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
 896
 897                if (tunnel->parms.o_flags&GRE_SEQ) {
 898                        ++tunnel->o_seqno;
 899                        *ptr = htonl(tunnel->o_seqno);
 900                        ptr--;
 901                }
 902                if (tunnel->parms.o_flags&GRE_KEY) {
 903                        *ptr = tunnel->parms.o_key;
 904                        ptr--;
 905                }
 906                if (tunnel->parms.o_flags&GRE_CSUM) {
 907                        *ptr = 0;
 908                        *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
 909                }
 910        }
 911
 912        nf_reset(skb);
 913        tstats = this_cpu_ptr(dev->tstats);
 914        __IPTUNNEL_XMIT(tstats, &dev->stats);
 915        return NETDEV_TX_OK;
 916
 917tx_error_icmp:
 918        dst_link_failure(skb);
 919
 920tx_error:
 921        dev->stats.tx_errors++;
 922        dev_kfree_skb(skb);
 923        return NETDEV_TX_OK;
 924}
 925
 926static int ipgre_tunnel_bind_dev(struct net_device *dev)
 927{
 928        struct net_device *tdev = NULL;
 929        struct ip_tunnel *tunnel;
 930        const struct iphdr *iph;
 931        int hlen = LL_MAX_HEADER;
 932        int mtu = ETH_DATA_LEN;
 933        int addend = sizeof(struct iphdr) + 4;
 934
 935        tunnel = netdev_priv(dev);
 936        iph = &tunnel->parms.iph;
 937
 938        /* Guess output device to choose reasonable mtu and needed_headroom */
 939
 940        if (iph->daddr) {
 941                struct flowi4 fl4;
 942                struct rtable *rt;
 943
 944                rt = ip_route_output_gre(dev_net(dev), &fl4,
 945                                         iph->daddr, iph->saddr,
 946                                         tunnel->parms.o_key,
 947                                         RT_TOS(iph->tos),
 948                                         tunnel->parms.link);
 949                if (!IS_ERR(rt)) {
 950                        tdev = rt->dst.dev;
 951                        ip_rt_put(rt);
 952                }
 953
 954                if (dev->type != ARPHRD_ETHER)
 955                        dev->flags |= IFF_POINTOPOINT;
 956        }
 957
 958        if (!tdev && tunnel->parms.link)
 959                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
 960
 961        if (tdev) {
 962                hlen = tdev->hard_header_len + tdev->needed_headroom;
 963                mtu = tdev->mtu;
 964        }
 965        dev->iflink = tunnel->parms.link;
 966
 967        /* Precalculate GRE options length */
 968        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
 969                if (tunnel->parms.o_flags&GRE_CSUM)
 970                        addend += 4;
 971                if (tunnel->parms.o_flags&GRE_KEY)
 972                        addend += 4;
 973                if (tunnel->parms.o_flags&GRE_SEQ)
 974                        addend += 4;
 975        }
 976        dev->needed_headroom = addend + hlen;
 977        mtu -= dev->hard_header_len + addend;
 978
 979        if (mtu < 68)
 980                mtu = 68;
 981
 982        tunnel->hlen = addend;
 983
 984        return mtu;
 985}
 986
/*
 * Legacy ioctl configuration interface (SIOC{GET,ADD,CHG,DEL}TUNNEL),
 * mirroring the netlink ops below.  Runs under RTNL.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device, userspace passes the parms that
		 * identify the tunnel to query; otherwise query dev itself. */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Accept only plain IPv4+GRE headers and flags we implement. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* A fixed TTL implies path-MTU discovery (set DF). */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Changing addresses must not flip the device
				 * between point-to-point and broadcast mode. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash the tunnel under its new keys;
				 * wait for in-flight readers before the
				 * parameters are rewritten. */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* The fallback tunnel itself cannot be deleted. */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1117
1118static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1119{
1120        struct ip_tunnel *tunnel = netdev_priv(dev);
1121        if (new_mtu < 68 ||
1122            new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1123                return -EINVAL;
1124        dev->mtu = new_mtu;
1125        return 0;
1126}
1127
1128/* Nice toy. Unfortunately, useless in real life :-)
1129   It allows to construct virtual multiprotocol broadcast "LAN"
1130   over the Internet, provided multicast routing is tuned.
1131
1132
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
1135   I have an impression, that Cisco could make something similar,
1136   but this feature is apparently missing in IOS<=11.2(8).
1137
1138   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1139   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1140
1141   ping -t 255 224.66.66.66
1142
1143   If nobody answers, mbone does not work.
1144
1145   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1146   ip addr add 10.66.66.<somewhat>/24 dev Universe
1147   ifconfig Universe up
1148   ifconfig Universe add fe80::<Your_real_addr>/10
1149   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1150   ftp 10.66.66.66
1151   ...
1152   ftp fec0:6666:6666::193.233.7.65
1153   ...
1154
1155 */
1156
1157static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1158                        unsigned short type,
1159                        const void *daddr, const void *saddr, unsigned int len)
1160{
1161        struct ip_tunnel *t = netdev_priv(dev);
1162        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1163        __be16 *p = (__be16*)(iph+1);
1164
1165        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1166        p[0]            = t->parms.o_flags;
1167        p[1]            = htons(type);
1168
1169        /*
1170         *      Set the source hardware address.
1171         */
1172
1173        if (saddr)
1174                memcpy(&iph->saddr, saddr, 4);
1175        if (daddr)
1176                memcpy(&iph->daddr, daddr, 4);
1177        if (iph->daddr)
1178                return t->hlen;
1179
1180        return -t->hlen;
1181}
1182
1183static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1184{
1185        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1186        memcpy(haddr, &iph->saddr, 4);
1187        return 4;
1188}
1189
/* header_ops for tunnels whose link-layer address is an IPv4 address. */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
1194
1195#ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast-mode tunnels: when the remote endpoint is a
 * multicast group, resolve the route to it and join the group on the
 * underlying output device.  Runs under RTNL.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		/* From here on, "dev" is the underlying output device,
		 * not the tunnel device. */
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		/* Remember where we joined so ipgre_close() can leave. */
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1221
1222static int ipgre_close(struct net_device *dev)
1223{
1224        struct ip_tunnel *t = netdev_priv(dev);
1225
1226        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1227                struct in_device *in_dev;
1228                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1229                if (in_dev)
1230                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1231        }
1232        return 0;
1233}
1234
1235#endif
1236
/* ndo callbacks for plain (layer-3) "gre" tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1249
/* netdev destructor: release per-cpu stats before freeing the device. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1255
/* Basic netdev setup shared by ioctl- and netlink-created GRE devices. */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	/* Defaults assume the 4-byte base GRE header; refined later by
	 * ipgre_tunnel_bind_dev() once o_flags are known. */
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	/* Link-layer addresses are the 4-byte tunnel endpoint addresses. */
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1270
/* ndo_init for layer-3 GRE devices: publish endpoint addresses, pick
 * header_ops where appropriate and allocate per-cpu stats. */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* dev_addr/broadcast carry the tunnel endpoints (addr_len == 4). */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		/* Multicast destination: broadcast-style "LAN" over GRE.
		 * A local address is mandatory to join the group. */
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1303
/* Initialize the per-netns fallback device "gre0", which receives GRE
 * packets not matching any configured tunnel. */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	/* Held for the lifetime of the netns; dropped on unregister. */
	dev_hold(dev);
}
1319
1320
/* Receive/ICMP-error hooks registered for the Cisco GRE version. */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1325
1326static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1327{
1328        int prio;
1329
1330        for (prio = 0; prio < 4; prio++) {
1331                int h;
1332                for (h = 0; h < HASH_SIZE; h++) {
1333                        struct ip_tunnel *t;
1334
1335                        t = rtnl_dereference(ign->tunnels[prio][h]);
1336
1337                        while (t != NULL) {
1338                                unregister_netdevice_queue(t->dev, head);
1339                                t = rtnl_dereference(t->next);
1340                        }
1341                }
1342        }
1343}
1344
/* Per-netns init: create, set up and register the fallback "gre0"
 * device, then hash it as the wildcard receive tunnel. */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	/* Wildcard slot: no key, no remote address. */
	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	/* Not yet registered, so free directly instead of unregistering. */
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1373
1374static void __net_exit ipgre_exit_net(struct net *net)
1375{
1376        struct ipgre_net *ign;
1377        LIST_HEAD(list);
1378
1379        ign = net_generic(net, ipgre_net_id);
1380        rtnl_lock();
1381        ipgre_destroy_tunnels(ign, &list);
1382        unregister_netdevice_many(&list);
1383        rtnl_unlock();
1384}
1385
/* Per-netns lifecycle for the GRE module state. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1392
1393static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1394{
1395        __be16 flags;
1396
1397        if (!data)
1398                return 0;
1399
1400        flags = 0;
1401        if (data[IFLA_GRE_IFLAGS])
1402                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1403        if (data[IFLA_GRE_OFLAGS])
1404                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1405        if (flags & (GRE_VERSION|GRE_ROUTING))
1406                return -EINVAL;
1407
1408        return 0;
1409}
1410
1411static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1412{
1413        __be32 daddr;
1414
1415        if (tb[IFLA_ADDRESS]) {
1416                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1417                        return -EINVAL;
1418                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1419                        return -EADDRNOTAVAIL;
1420        }
1421
1422        if (!data)
1423                goto out;
1424
1425        if (data[IFLA_GRE_REMOTE]) {
1426                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1427                if (!daddr)
1428                        return -EINVAL;
1429        }
1430
1431out:
1432        return ipgre_tunnel_validate(tb, data);
1433}
1434
/*
 * Translate IFLA_GRE_* netlink attributes into tunnel parameters.
 * Missing attributes keep the zeroed defaults; path-MTU discovery
 * (DF bit) is on unless IFLA_GRE_PMTUDISC is present and zero.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1475
1476static int ipgre_tap_init(struct net_device *dev)
1477{
1478        struct ip_tunnel *tunnel;
1479
1480        tunnel = netdev_priv(dev);
1481
1482        tunnel->dev = dev;
1483        strcpy(tunnel->parms.name, dev->name);
1484
1485        ipgre_tunnel_bind_dev(dev);
1486
1487        dev->tstats = alloc_percpu(struct pcpu_tstats);
1488        if (!dev->tstats)
1489                return -ENOMEM;
1490
1491        return 0;
1492}
1493
/* ndo callbacks for "gretap" (Ethernet-over-GRE) devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1503
1504static void ipgre_tap_setup(struct net_device *dev)
1505{
1506
1507        ether_setup(dev);
1508
1509        dev->netdev_ops         = &ipgre_tap_netdev_ops;
1510        dev->destructor         = ipgre_dev_free;
1511
1512        dev->iflink             = 0;
1513        dev->features           |= NETIF_F_NETNS_LOCAL;
1514}
1515
/* rtnl_link newlink: create, register and hash a new gre/gretap device. */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	/* An explicit IFLA_MTU from userspace wins over the computed one. */
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1552
/* rtnl_link changelink: update an existing device's tunnel parameters,
 * re-hashing it when addresses or the input key change. */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	/* The fallback device's parameters are fixed. */
	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* New parameters already belong to another device. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Must not flip the device between point-to-point
			 * and broadcast mode. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Re-hash under the new addresses/key.  NOTE(review): the
		 * ioctl path calls synchronize_net() between unlink and the
		 * parameter update; this path does not — confirm whether
		 * that difference is intentional. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* Parameters that do not affect hashing are always updated. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1616
1617static size_t ipgre_get_size(const struct net_device *dev)
1618{
1619        return
1620                /* IFLA_GRE_LINK */
1621                nla_total_size(4) +
1622                /* IFLA_GRE_IFLAGS */
1623                nla_total_size(2) +
1624                /* IFLA_GRE_OFLAGS */
1625                nla_total_size(2) +
1626                /* IFLA_GRE_IKEY */
1627                nla_total_size(4) +
1628                /* IFLA_GRE_OKEY */
1629                nla_total_size(4) +
1630                /* IFLA_GRE_LOCAL */
1631                nla_total_size(4) +
1632                /* IFLA_GRE_REMOTE */
1633                nla_total_size(4) +
1634                /* IFLA_GRE_TTL */
1635                nla_total_size(1) +
1636                /* IFLA_GRE_TOS */
1637                nla_total_size(1) +
1638                /* IFLA_GRE_PMTUDISC */
1639                nla_total_size(1) +
1640                0;
1641}
1642
/* Dump tunnel parameters as IFLA_GRE_* attributes.  The NLA_PUT_*
 * macros jump to nla_put_failure when the skb runs out of room. */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1664
/* Netlink attribute validation policy for the IFLA_GRE_* attributes. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1677
/* rtnl_link ops for plain (layer-3) "gre" devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1690
/* rtnl_link ops for "gretap" (Ethernet-over-GRE) devices. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1703
1704/*
1705 *      And now the modules code and kernel interface.
1706 */
1707
/*
 * Module init.  Ordering matters: pernet state is registered first so
 * the protocol handler never sees a netns without its ipgre_net, then
 * the GRE protocol hook, then the two netlink link ops.  Failures
 * unwind in reverse order.
 */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}
1743
/* Module exit: tear everything down in reverse registration order. */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}
1752
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Allow autoloading via "ip link add ... type gre|gretap" and "gre0". */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
1759