linux/net/ipv4/ip_gre.c
<<
>>
Prefs
   1/*
   2 *      Linux NET3:     GRE over IP protocol decoder.
   3 *
   4 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   5 *
   6 *      This program is free software; you can redistribute it and/or
   7 *      modify it under the terms of the GNU General Public License
   8 *      as published by the Free Software Foundation; either version
   9 *      2 of the License, or (at your option) any later version.
  10 *
  11 */
  12
  13#include <linux/capability.h>
  14#include <linux/module.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/slab.h>
  18#include <asm/uaccess.h>
  19#include <linux/skbuff.h>
  20#include <linux/netdevice.h>
  21#include <linux/in.h>
  22#include <linux/tcp.h>
  23#include <linux/udp.h>
  24#include <linux/if_arp.h>
  25#include <linux/mroute.h>
  26#include <linux/init.h>
  27#include <linux/in6.h>
  28#include <linux/inetdevice.h>
  29#include <linux/igmp.h>
  30#include <linux/netfilter_ipv4.h>
  31#include <linux/etherdevice.h>
  32#include <linux/if_ether.h>
  33
  34#include <net/sock.h>
  35#include <net/ip.h>
  36#include <net/icmp.h>
  37#include <net/protocol.h>
  38#include <net/ipip.h>
  39#include <net/arp.h>
  40#include <net/checksum.h>
  41#include <net/dsfield.h>
  42#include <net/inet_ecn.h>
  43#include <net/xfrm.h>
  44#include <net/net_namespace.h>
  45#include <net/netns/generic.h>
  46#include <net/rtnetlink.h>
  47#include <net/gre.h>
  48
  49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
  50#include <net/ipv6.h>
  51#include <net/ip6_fib.h>
  52#include <net/ip6_route.h>
  53#endif
  54
  55/*
  56   Problems & solutions
  57   --------------------
  58
  59   1. The most important issue is detecting local dead loops.
  60   They would cause complete host lockup in transmit, which
  61   would be "resolved" by stack overflow or, if queueing is enabled,
  62   with infinite looping in net_bh.
  63
  64   We cannot track such dead loops during route installation,
  65   it is infeasible task. The most general solutions would be
  66   to keep skb->encapsulation counter (sort of local ttl),
  67   and silently drop packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
  69   skb, even if no tunneling is used.
  70
  71   Current solution: xmit_recursion breaks dead loops. This is a percpu
  72   counter, since when we enter the first ndo_xmit(), cpu migration is
  73   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
  74
  75   2. Networking dead loops would not kill routers, but would really
  76   kill network. IP hop limit plays role of "t->recursion" in this case,
  77   if we copy it from packet being encapsulated to upper header.
  78   It is very good solution, but it introduces two problems:
  79
  80   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  81     do not work over tunnels.
  82   - traceroute does not work. I planned to relay ICMP from tunnel,
  83     so that this problem would be solved and traceroute output
  84     would even more informative. This idea appeared to be wrong:
  85     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  86     true router now :-)), all routers (at least, in neighbourhood of mine)
  87     return only 8 bytes of payload. It is the end.
  88
  89   Hence, if we want that OSPF worked or traceroute said something reasonable,
  90   we should search for another solution.
  91
  92   One of them is to parse packet trying to detect inner encapsulation
  93   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.
  95
  96   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  97   We force DF flag on tunnels with preconfigured hop limit,
  98   that is ALL. :-) Well, it does not remove the problem completely,
  99   but exponential growth of network traffic is changed to linear
 100   (branches, that exceed pmtu are pruned) and tunnel mtu
 101   fastly degrades to value <68, where looping stops.
 102   Yes, it is not good if there exists a router in the loop,
 103   which does not force DF, even when encapsulating packets have DF set.
 104   But it is not our problem! Nobody could accuse us, we made
 105   all that we could make. Even if it is your gated who injected
 106   fatal route to network, even if it were you who configured
 107   fatal static route: you are innocent. :-)
 108
 109
 110
 111   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
 112   practically identical code. It would be good to glue them
 113   together, but it is not very evident, how to make them modular.
 114   sit is integral part of IPv6, ipip and gre are naturally modular.
 115   We could extract common parts (hash table, ioctl etc)
 116   to a separate module (ip_tunnel.c).
 117
 118   Alexey Kuznetsov.
 119 */
 120
 121static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 122static int ipgre_tunnel_init(struct net_device *dev);
 123static void ipgre_tunnel_setup(struct net_device *dev);
 124static int ipgre_tunnel_bind_dev(struct net_device *dev);
 125
 126/* Fallback tunnel: no source, no destination, no key, no options */
 127
 128#define HASH_SIZE  16
 129
 130static int ipgre_net_id __read_mostly;
/* GRE state fetched with net_generic(net, ipgre_net_id). */
struct ipgre_net {
	/*
	 * Tunnel hash tables. The first index selects the table (see the
	 * tunnels_r_l / tunnels_r / tunnels_l / tunnels_wc macros), the
	 * second index is the hash bucket.
	 */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/*
	 * Fallback tunnel device: ipgre_tunnel_lookup() hands back its
	 * private data when no configured tunnel matches and it is IFF_UP.
	 */
	struct net_device *fb_tunnel_dev;
};
 136
 137/* Tunnel hash table */
 138
 139/*
 140   4 hash tables:
 141
 142   3: (remote,local)
 143   2: (remote,*)
 144   1: (*,local)
 145   0: (*,*)
 146
 147   We require exact key match i.e. if a key is present in packet
 148   it will match only tunnel with the same key; if it is not present,
 149   it will match only keyless tunnel.
 150
   All keyless packets, if not matched by configured keyless tunnels,
   will match the fallback tunnel.
 153 */
 154
 155#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 156
 157#define tunnels_r_l     tunnels[3]
 158#define tunnels_r       tunnels[2]
 159#define tunnels_l       tunnels[1]
 160#define tunnels_wc      tunnels[0]
 161/*
 162 * Locking : hash tables are protected by RCU and RTNL
 163 */
 164
 165#define for_each_ip_tunnel_rcu(start) \
 166        for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 167
/*
 * Frequently updated counters are kept per cpu; everything else is
 * shared and lives in netdev->stats.  ipgre_get_stats() sums the per-cpu
 * copies into dev->stats.
 */
struct pcpu_tstats {
	unsigned long   rx_packets;	/* packets received */
	unsigned long   rx_bytes;	/* bytes received */
	unsigned long   tx_packets;	/* packets transmitted */
	unsigned long   tx_bytes;	/* bytes transmitted */
};
 175
 176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
 177{
 178        struct pcpu_tstats sum = { 0 };
 179        int i;
 180
 181        for_each_possible_cpu(i) {
 182                const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
 183
 184                sum.rx_packets += tstats->rx_packets;
 185                sum.rx_bytes   += tstats->rx_bytes;
 186                sum.tx_packets += tstats->tx_packets;
 187                sum.tx_bytes   += tstats->tx_bytes;
 188        }
 189        dev->stats.rx_packets = sum.rx_packets;
 190        dev->stats.rx_bytes   = sum.rx_bytes;
 191        dev->stats.tx_packets = sum.tx_packets;
 192        dev->stats.tx_bytes   = sum.tx_bytes;
 193        return &dev->stats;
 194}
 195
 196/* Given src, dst and key, find appropriate for input tunnel. */
 197
 198static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 199                                              __be32 remote, __be32 local,
 200                                              __be32 key, __be16 gre_proto)
 201{
 202        struct net *net = dev_net(dev);
 203        int link = dev->ifindex;
 204        unsigned int h0 = HASH(remote);
 205        unsigned int h1 = HASH(key);
 206        struct ip_tunnel *t, *cand = NULL;
 207        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 208        int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
 209                       ARPHRD_ETHER : ARPHRD_IPGRE;
 210        int score, cand_score = 4;
 211
 212        for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
 213                if (local != t->parms.iph.saddr ||
 214                    remote != t->parms.iph.daddr ||
 215                    key != t->parms.i_key ||
 216                    !(t->dev->flags & IFF_UP))
 217                        continue;
 218
 219                if (t->dev->type != ARPHRD_IPGRE &&
 220                    t->dev->type != dev_type)
 221                        continue;
 222
 223                score = 0;
 224                if (t->parms.link != link)
 225                        score |= 1;
 226                if (t->dev->type != dev_type)
 227                        score |= 2;
 228                if (score == 0)
 229                        return t;
 230
 231                if (score < cand_score) {
 232                        cand = t;
 233                        cand_score = score;
 234                }
 235        }
 236
 237        for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
 238                if (remote != t->parms.iph.daddr ||
 239                    key != t->parms.i_key ||
 240                    !(t->dev->flags & IFF_UP))
 241                        continue;
 242
 243                if (t->dev->type != ARPHRD_IPGRE &&
 244                    t->dev->type != dev_type)
 245                        continue;
 246
 247                score = 0;
 248                if (t->parms.link != link)
 249                        score |= 1;
 250                if (t->dev->type != dev_type)
 251                        score |= 2;
 252                if (score == 0)
 253                        return t;
 254
 255                if (score < cand_score) {
 256                        cand = t;
 257                        cand_score = score;
 258                }
 259        }
 260
 261        for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
 262                if ((local != t->parms.iph.saddr &&
 263                     (local != t->parms.iph.daddr ||
 264                      !ipv4_is_multicast(local))) ||
 265                    key != t->parms.i_key ||
 266                    !(t->dev->flags & IFF_UP))
 267                        continue;
 268
 269                if (t->dev->type != ARPHRD_IPGRE &&
 270                    t->dev->type != dev_type)
 271                        continue;
 272
 273                score = 0;
 274                if (t->parms.link != link)
 275                        score |= 1;
 276                if (t->dev->type != dev_type)
 277                        score |= 2;
 278                if (score == 0)
 279                        return t;
 280
 281                if (score < cand_score) {
 282                        cand = t;
 283                        cand_score = score;
 284                }
 285        }
 286
 287        for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
 288                if (t->parms.i_key != key ||
 289                    !(t->dev->flags & IFF_UP))
 290                        continue;
 291
 292                if (t->dev->type != ARPHRD_IPGRE &&
 293                    t->dev->type != dev_type)
 294                        continue;
 295
 296                score = 0;
 297                if (t->parms.link != link)
 298                        score |= 1;
 299                if (t->dev->type != dev_type)
 300                        score |= 2;
 301                if (score == 0)
 302                        return t;
 303
 304                if (score < cand_score) {
 305                        cand = t;
 306                        cand_score = score;
 307                }
 308        }
 309
 310        if (cand != NULL)
 311                return cand;
 312
 313        dev = ign->fb_tunnel_dev;
 314        if (dev->flags & IFF_UP)
 315                return netdev_priv(dev);
 316
 317        return NULL;
 318}
 319
 320static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
 321                struct ip_tunnel_parm *parms)
 322{
 323        __be32 remote = parms->iph.daddr;
 324        __be32 local = parms->iph.saddr;
 325        __be32 key = parms->i_key;
 326        unsigned int h = HASH(key);
 327        int prio = 0;
 328
 329        if (local)
 330                prio |= 1;
 331        if (remote && !ipv4_is_multicast(remote)) {
 332                prio |= 2;
 333                h ^= HASH(remote);
 334        }
 335
 336        return &ign->tunnels[prio][h];
 337}
 338
 339static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
 340                struct ip_tunnel *t)
 341{
 342        return __ipgre_bucket(ign, &t->parms);
 343}
 344
 345static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
 346{
 347        struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
 348
 349        rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 350        rcu_assign_pointer(*tp, t);
 351}
 352
 353static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 354{
 355        struct ip_tunnel __rcu **tp;
 356        struct ip_tunnel *iter;
 357
 358        for (tp = ipgre_bucket(ign, t);
 359             (iter = rtnl_dereference(*tp)) != NULL;
 360             tp = &iter->next) {
 361                if (t == iter) {
 362                        rcu_assign_pointer(*tp, t->next);
 363                        break;
 364                }
 365        }
 366}
 367
 368static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 369                                           struct ip_tunnel_parm *parms,
 370                                           int type)
 371{
 372        __be32 remote = parms->iph.daddr;
 373        __be32 local = parms->iph.saddr;
 374        __be32 key = parms->i_key;
 375        int link = parms->link;
 376        struct ip_tunnel *t;
 377        struct ip_tunnel __rcu **tp;
 378        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 379
 380        for (tp = __ipgre_bucket(ign, parms);
 381             (t = rtnl_dereference(*tp)) != NULL;
 382             tp = &t->next)
 383                if (local == t->parms.iph.saddr &&
 384                    remote == t->parms.iph.daddr &&
 385                    key == t->parms.i_key &&
 386                    link == t->parms.link &&
 387                    type == t->dev->type)
 388                        break;
 389
 390        return t;
 391}
 392
 393static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
 394                struct ip_tunnel_parm *parms, int create)
 395{
 396        struct ip_tunnel *t, *nt;
 397        struct net_device *dev;
 398        char name[IFNAMSIZ];
 399        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 400
 401        t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
 402        if (t || !create)
 403                return t;
 404
 405        if (parms->name[0])
 406                strlcpy(name, parms->name, IFNAMSIZ);
 407        else
 408                strcpy(name, "gre%d");
 409
 410        dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
 411        if (!dev)
 412                return NULL;
 413
 414        dev_net_set(dev, net);
 415
 416        nt = netdev_priv(dev);
 417        nt->parms = *parms;
 418        dev->rtnl_link_ops = &ipgre_link_ops;
 419
 420        dev->mtu = ipgre_tunnel_bind_dev(dev);
 421
 422        if (register_netdevice(dev) < 0)
 423                goto failed_free;
 424
 425        dev_hold(dev);
 426        ipgre_tunnel_link(ign, nt);
 427        return nt;
 428
 429failed_free:
 430        free_netdev(dev);
 431        return NULL;
 432}
 433
 434static void ipgre_tunnel_uninit(struct net_device *dev)
 435{
 436        struct net *net = dev_net(dev);
 437        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 438
 439        ipgre_tunnel_unlink(ign, netdev_priv(dev));
 440        dev_put(dev);
 441}
 442
 443
 444static void ipgre_err(struct sk_buff *skb, u32 info)
 445{
 446
 447/* All the routers (except for Linux) return only
 448   8 bytes of packet payload. It means, that precise relaying of
 449   ICMP in the real Internet is absolutely infeasible.
 450
 451   Moreover, Cisco "wise men" put GRE key to the third word
 452   in GRE header. It makes impossible maintaining even soft state for keyed
 453   GRE tunnels with enabled checksum. Tell them "thank you".
 454
 455   Well, I wonder, rfc1812 was written by Cisco employee,
 456   what the hell these idiots break standrads established
 457   by themself???
 458 */
 459
 460        const struct iphdr *iph = (const struct iphdr *)skb->data;
 461        __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
 462        int grehlen = (iph->ihl<<2) + 4;
 463        const int type = icmp_hdr(skb)->type;
 464        const int code = icmp_hdr(skb)->code;
 465        struct ip_tunnel *t;
 466        __be16 flags;
 467
 468        flags = p[0];
 469        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
 470                if (flags&(GRE_VERSION|GRE_ROUTING))
 471                        return;
 472                if (flags&GRE_KEY) {
 473                        grehlen += 4;
 474                        if (flags&GRE_CSUM)
 475                                grehlen += 4;
 476                }
 477        }
 478
 479        /* If only 8 bytes returned, keyed message will be dropped here */
 480        if (skb_headlen(skb) < grehlen)
 481                return;
 482
 483        switch (type) {
 484        default:
 485        case ICMP_PARAMETERPROB:
 486                return;
 487
 488        case ICMP_DEST_UNREACH:
 489                switch (code) {
 490                case ICMP_SR_FAILED:
 491                case ICMP_PORT_UNREACH:
 492                        /* Impossible event. */
 493                        return;
 494                case ICMP_FRAG_NEEDED:
 495                        /* Soft state for pmtu is maintained by IP core. */
 496                        return;
 497                default:
 498                        /* All others are translated to HOST_UNREACH.
 499                           rfc2003 contains "deep thoughts" about NET_UNREACH,
 500                           I believe they are just ether pollution. --ANK
 501                         */
 502                        break;
 503                }
 504                break;
 505        case ICMP_TIME_EXCEEDED:
 506                if (code != ICMP_EXC_TTL)
 507                        return;
 508                break;
 509        }
 510
 511        rcu_read_lock();
 512        t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
 513                                flags & GRE_KEY ?
 514                                *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
 515                                p[1]);
 516        if (t == NULL || t->parms.iph.daddr == 0 ||
 517            ipv4_is_multicast(t->parms.iph.daddr))
 518                goto out;
 519
 520        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 521                goto out;
 522
 523        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 524                t->err_count++;
 525        else
 526                t->err_count = 1;
 527        t->err_time = jiffies;
 528out:
 529        rcu_read_unlock();
 530}
 531
 532static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
 533{
 534        if (INET_ECN_is_ce(iph->tos)) {
 535                if (skb->protocol == htons(ETH_P_IP)) {
 536                        IP_ECN_set_ce(ip_hdr(skb));
 537                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 538                        IP6_ECN_set_ce(ipv6_hdr(skb));
 539                }
 540        }
 541}
 542
 543static inline u8
 544ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
 545{
 546        u8 inner = 0;
 547        if (skb->protocol == htons(ETH_P_IP))
 548                inner = old_iph->tos;
 549        else if (skb->protocol == htons(ETH_P_IPV6))
 550                inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 551        return INET_ECN_encapsulate(tos, inner);
 552}
 553
 554static int ipgre_rcv(struct sk_buff *skb)
 555{
 556        const struct iphdr *iph;
 557        u8     *h;
 558        __be16    flags;
 559        __sum16   csum = 0;
 560        __be32 key = 0;
 561        u32    seqno = 0;
 562        struct ip_tunnel *tunnel;
 563        int    offset = 4;
 564        __be16 gre_proto;
 565
 566        if (!pskb_may_pull(skb, 16))
 567                goto drop_nolock;
 568
 569        iph = ip_hdr(skb);
 570        h = skb->data;
 571        flags = *(__be16*)h;
 572
 573        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
 574                /* - Version must be 0.
 575                   - We do not support routing headers.
 576                 */
 577                if (flags&(GRE_VERSION|GRE_ROUTING))
 578                        goto drop_nolock;
 579
 580                if (flags&GRE_CSUM) {
 581                        switch (skb->ip_summed) {
 582                        case CHECKSUM_COMPLETE:
 583                                csum = csum_fold(skb->csum);
 584                                if (!csum)
 585                                        break;
 586                                /* fall through */
 587                        case CHECKSUM_NONE:
 588                                skb->csum = 0;
 589                                csum = __skb_checksum_complete(skb);
 590                                skb->ip_summed = CHECKSUM_COMPLETE;
 591                        }
 592                        offset += 4;
 593                }
 594                if (flags&GRE_KEY) {
 595                        key = *(__be32*)(h + offset);
 596                        offset += 4;
 597                }
 598                if (flags&GRE_SEQ) {
 599                        seqno = ntohl(*(__be32*)(h + offset));
 600                        offset += 4;
 601                }
 602        }
 603
 604        gre_proto = *(__be16 *)(h + 2);
 605
 606        rcu_read_lock();
 607        if ((tunnel = ipgre_tunnel_lookup(skb->dev,
 608                                          iph->saddr, iph->daddr, key,
 609                                          gre_proto))) {
 610                struct pcpu_tstats *tstats;
 611
 612                secpath_reset(skb);
 613
 614                skb->protocol = gre_proto;
 615                /* WCCP version 1 and 2 protocol decoding.
 616                 * - Change protocol to IP
 617                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
 618                 */
 619                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
 620                        skb->protocol = htons(ETH_P_IP);
 621                        if ((*(h + offset) & 0xF0) != 0x40)
 622                                offset += 4;
 623                }
 624
 625                skb->mac_header = skb->network_header;
 626                __pskb_pull(skb, offset);
 627                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
 628                skb->pkt_type = PACKET_HOST;
 629#ifdef CONFIG_NET_IPGRE_BROADCAST
 630                if (ipv4_is_multicast(iph->daddr)) {
 631                        /* Looped back packet, drop it! */
 632                        if (rt_is_output_route(skb_rtable(skb)))
 633                                goto drop;
 634                        tunnel->dev->stats.multicast++;
 635                        skb->pkt_type = PACKET_BROADCAST;
 636                }
 637#endif
 638
 639                if (((flags&GRE_CSUM) && csum) ||
 640                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
 641                        tunnel->dev->stats.rx_crc_errors++;
 642                        tunnel->dev->stats.rx_errors++;
 643                        goto drop;
 644                }
 645                if (tunnel->parms.i_flags&GRE_SEQ) {
 646                        if (!(flags&GRE_SEQ) ||
 647                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
 648                                tunnel->dev->stats.rx_fifo_errors++;
 649                                tunnel->dev->stats.rx_errors++;
 650                                goto drop;
 651                        }
 652                        tunnel->i_seqno = seqno + 1;
 653                }
 654
 655                /* Warning: All skb pointers will be invalidated! */
 656                if (tunnel->dev->type == ARPHRD_ETHER) {
 657                        if (!pskb_may_pull(skb, ETH_HLEN)) {
 658                                tunnel->dev->stats.rx_length_errors++;
 659                                tunnel->dev->stats.rx_errors++;
 660                                goto drop;
 661                        }
 662
 663                        iph = ip_hdr(skb);
 664                        skb->protocol = eth_type_trans(skb, tunnel->dev);
 665                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 666                }
 667
 668                tstats = this_cpu_ptr(tunnel->dev->tstats);
 669                tstats->rx_packets++;
 670                tstats->rx_bytes += skb->len;
 671
 672                __skb_tunnel_rx(skb, tunnel->dev);
 673
 674                skb_reset_network_header(skb);
 675                ipgre_ecn_decapsulate(iph, skb);
 676
 677                netif_rx(skb);
 678
 679                rcu_read_unlock();
 680                return 0;
 681        }
 682        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 683
 684drop:
 685        rcu_read_unlock();
 686drop_nolock:
 687        kfree_skb(skb);
 688        return 0;
 689}
 690
 691static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 692{
 693        struct ip_tunnel *tunnel = netdev_priv(dev);
 694        struct pcpu_tstats *tstats;
 695        const struct iphdr  *old_iph = ip_hdr(skb);
 696        const struct iphdr  *tiph;
 697        struct flowi4 fl4;
 698        u8     tos;
 699        __be16 df;
 700        struct rtable *rt;                      /* Route to the other host */
 701        struct net_device *tdev;                /* Device to other host */
 702        struct iphdr  *iph;                     /* Our new IP header */
 703        unsigned int max_headroom;              /* The extra header space needed */
 704        int    gre_hlen;
 705        __be32 dst;
 706        int    mtu;
 707
 708        if (dev->type == ARPHRD_ETHER)
 709                IPCB(skb)->flags = 0;
 710
 711        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 712                gre_hlen = 0;
 713                tiph = (const struct iphdr *)skb->data;
 714        } else {
 715                gre_hlen = tunnel->hlen;
 716                tiph = &tunnel->parms.iph;
 717        }
 718
 719        if ((dst = tiph->daddr) == 0) {
 720                /* NBMA tunnel */
 721
 722                if (skb_dst(skb) == NULL) {
 723                        dev->stats.tx_fifo_errors++;
 724                        goto tx_error;
 725                }
 726
 727                if (skb->protocol == htons(ETH_P_IP)) {
 728                        rt = skb_rtable(skb);
 729                        if ((dst = rt->rt_gateway) == 0)
 730                                goto tx_error_icmp;
 731                }
 732#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 733                else if (skb->protocol == htons(ETH_P_IPV6)) {
 734                        struct neighbour *neigh = dst_get_neighbour(skb_dst(skb));
 735                        const struct in6_addr *addr6;
 736                        int addr_type;
 737
 738                        if (neigh == NULL)
 739                                goto tx_error;
 740
 741                        addr6 = (const struct in6_addr *)&neigh->primary_key;
 742                        addr_type = ipv6_addr_type(addr6);
 743
 744                        if (addr_type == IPV6_ADDR_ANY) {
 745                                addr6 = &ipv6_hdr(skb)->daddr;
 746                                addr_type = ipv6_addr_type(addr6);
 747                        }
 748
 749                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 750                                goto tx_error_icmp;
 751
 752                        dst = addr6->s6_addr32[3];
 753                }
 754#endif
 755                else
 756                        goto tx_error;
 757        }
 758
 759        tos = tiph->tos;
 760        if (tos == 1) {
 761                tos = 0;
 762                if (skb->protocol == htons(ETH_P_IP))
 763                        tos = old_iph->tos;
 764                else if (skb->protocol == htons(ETH_P_IPV6))
 765                        tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 766        }
 767
 768        rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
 769                                 tunnel->parms.o_key, RT_TOS(tos),
 770                                 tunnel->parms.link);
 771        if (IS_ERR(rt)) {
 772                dev->stats.tx_carrier_errors++;
 773                goto tx_error;
 774        }
 775        tdev = rt->dst.dev;
 776
 777        if (tdev == dev) {
 778                ip_rt_put(rt);
 779                dev->stats.collisions++;
 780                goto tx_error;
 781        }
 782
 783        df = tiph->frag_off;
 784        if (df)
 785                mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
 786        else
 787                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 788
 789        if (skb_dst(skb))
 790                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 791
 792        if (skb->protocol == htons(ETH_P_IP)) {
 793                df |= (old_iph->frag_off&htons(IP_DF));
 794
 795                if ((old_iph->frag_off&htons(IP_DF)) &&
 796                    mtu < ntohs(old_iph->tot_len)) {
 797                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 798                        ip_rt_put(rt);
 799                        goto tx_error;
 800                }
 801        }
 802#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 803        else if (skb->protocol == htons(ETH_P_IPV6)) {
 804                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 805
 806                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
 807                        if ((tunnel->parms.iph.daddr &&
 808                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 809                            rt6->rt6i_dst.plen == 128) {
 810                                rt6->rt6i_flags |= RTF_MODIFIED;
 811                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 812                        }
 813                }
 814
 815                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
 816                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 817                        ip_rt_put(rt);
 818                        goto tx_error;
 819                }
 820        }
 821#endif
 822
 823        if (tunnel->err_count > 0) {
 824                if (time_before(jiffies,
 825                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 826                        tunnel->err_count--;
 827
 828                        dst_link_failure(skb);
 829                } else
 830                        tunnel->err_count = 0;
 831        }
 832
 833        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
 834
 835        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 836            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
 837                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 838                if (max_headroom > dev->needed_headroom)
 839                        dev->needed_headroom = max_headroom;
 840                if (!new_skb) {
 841                        ip_rt_put(rt);
 842                        dev->stats.tx_dropped++;
 843                        dev_kfree_skb(skb);
 844                        return NETDEV_TX_OK;
 845                }
 846                if (skb->sk)
 847                        skb_set_owner_w(new_skb, skb->sk);
 848                dev_kfree_skb(skb);
 849                skb = new_skb;
 850                old_iph = ip_hdr(skb);
 851        }
 852
 853        skb_reset_transport_header(skb);
 854        skb_push(skb, gre_hlen);
 855        skb_reset_network_header(skb);
 856        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 857        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 858                              IPSKB_REROUTED);
 859        skb_dst_drop(skb);
 860        skb_dst_set(skb, &rt->dst);
 861
 862        /*
 863         *      Push down and install the IPIP header.
 864         */
 865
 866        iph                     =       ip_hdr(skb);
 867        iph->version            =       4;
 868        iph->ihl                =       sizeof(struct iphdr) >> 2;
 869        iph->frag_off           =       df;
 870        iph->protocol           =       IPPROTO_GRE;
 871        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
 872        iph->daddr              =       fl4.daddr;
 873        iph->saddr              =       fl4.saddr;
 874
 875        if ((iph->ttl = tiph->ttl) == 0) {
 876                if (skb->protocol == htons(ETH_P_IP))
 877                        iph->ttl = old_iph->ttl;
 878#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 879                else if (skb->protocol == htons(ETH_P_IPV6))
 880                        iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
 881#endif
 882                else
 883                        iph->ttl = ip4_dst_hoplimit(&rt->dst);
 884        }
 885
 886        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
 887        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
 888                                   htons(ETH_P_TEB) : skb->protocol;
 889
 890        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 891                __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
 892
 893                if (tunnel->parms.o_flags&GRE_SEQ) {
 894                        ++tunnel->o_seqno;
 895                        *ptr = htonl(tunnel->o_seqno);
 896                        ptr--;
 897                }
 898                if (tunnel->parms.o_flags&GRE_KEY) {
 899                        *ptr = tunnel->parms.o_key;
 900                        ptr--;
 901                }
 902                if (tunnel->parms.o_flags&GRE_CSUM) {
 903                        *ptr = 0;
 904                        *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
 905                }
 906        }
 907
 908        nf_reset(skb);
 909        tstats = this_cpu_ptr(dev->tstats);
 910        __IPTUNNEL_XMIT(tstats, &dev->stats);
 911        return NETDEV_TX_OK;
 912
 913tx_error_icmp:
 914        dst_link_failure(skb);
 915
 916tx_error:
 917        dev->stats.tx_errors++;
 918        dev_kfree_skb(skb);
 919        return NETDEV_TX_OK;
 920}
 921
 922static int ipgre_tunnel_bind_dev(struct net_device *dev)
 923{
 924        struct net_device *tdev = NULL;
 925        struct ip_tunnel *tunnel;
 926        const struct iphdr *iph;
 927        int hlen = LL_MAX_HEADER;
 928        int mtu = ETH_DATA_LEN;
 929        int addend = sizeof(struct iphdr) + 4;
 930
 931        tunnel = netdev_priv(dev);
 932        iph = &tunnel->parms.iph;
 933
 934        /* Guess output device to choose reasonable mtu and needed_headroom */
 935
 936        if (iph->daddr) {
 937                struct flowi4 fl4;
 938                struct rtable *rt;
 939
 940                rt = ip_route_output_gre(dev_net(dev), &fl4,
 941                                         iph->daddr, iph->saddr,
 942                                         tunnel->parms.o_key,
 943                                         RT_TOS(iph->tos),
 944                                         tunnel->parms.link);
 945                if (!IS_ERR(rt)) {
 946                        tdev = rt->dst.dev;
 947                        ip_rt_put(rt);
 948                }
 949
 950                if (dev->type != ARPHRD_ETHER)
 951                        dev->flags |= IFF_POINTOPOINT;
 952        }
 953
 954        if (!tdev && tunnel->parms.link)
 955                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
 956
 957        if (tdev) {
 958                hlen = tdev->hard_header_len + tdev->needed_headroom;
 959                mtu = tdev->mtu;
 960        }
 961        dev->iflink = tunnel->parms.link;
 962
 963        /* Precalculate GRE options length */
 964        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
 965                if (tunnel->parms.o_flags&GRE_CSUM)
 966                        addend += 4;
 967                if (tunnel->parms.o_flags&GRE_KEY)
 968                        addend += 4;
 969                if (tunnel->parms.o_flags&GRE_SEQ)
 970                        addend += 4;
 971        }
 972        dev->needed_headroom = addend + hlen;
 973        mtu -= dev->hard_header_len + addend;
 974
 975        if (mtu < 68)
 976                mtu = 68;
 977
 978        tunnel->hlen = addend;
 979
 980        return mtu;
 981}
 982
 983static int
 984ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 985{
 986        int err = 0;
 987        struct ip_tunnel_parm p;
 988        struct ip_tunnel *t;
 989        struct net *net = dev_net(dev);
 990        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 991
 992        switch (cmd) {
 993        case SIOCGETTUNNEL:
 994                t = NULL;
 995                if (dev == ign->fb_tunnel_dev) {
 996                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
 997                                err = -EFAULT;
 998                                break;
 999                        }
1000                        t = ipgre_tunnel_locate(net, &p, 0);
1001                }
1002                if (t == NULL)
1003                        t = netdev_priv(dev);
1004                memcpy(&p, &t->parms, sizeof(p));
1005                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1006                        err = -EFAULT;
1007                break;
1008
1009        case SIOCADDTUNNEL:
1010        case SIOCCHGTUNNEL:
1011                err = -EPERM;
1012                if (!capable(CAP_NET_ADMIN))
1013                        goto done;
1014
1015                err = -EFAULT;
1016                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1017                        goto done;
1018
1019                err = -EINVAL;
1020                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1021                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1022                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1023                        goto done;
1024                if (p.iph.ttl)
1025                        p.iph.frag_off |= htons(IP_DF);
1026
1027                if (!(p.i_flags&GRE_KEY))
1028                        p.i_key = 0;
1029                if (!(p.o_flags&GRE_KEY))
1030                        p.o_key = 0;
1031
1032                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1033
1034                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1035                        if (t != NULL) {
1036                                if (t->dev != dev) {
1037                                        err = -EEXIST;
1038                                        break;
1039                                }
1040                        } else {
1041                                unsigned int nflags = 0;
1042
1043                                t = netdev_priv(dev);
1044
1045                                if (ipv4_is_multicast(p.iph.daddr))
1046                                        nflags = IFF_BROADCAST;
1047                                else if (p.iph.daddr)
1048                                        nflags = IFF_POINTOPOINT;
1049
1050                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1051                                        err = -EINVAL;
1052                                        break;
1053                                }
1054                                ipgre_tunnel_unlink(ign, t);
1055                                synchronize_net();
1056                                t->parms.iph.saddr = p.iph.saddr;
1057                                t->parms.iph.daddr = p.iph.daddr;
1058                                t->parms.i_key = p.i_key;
1059                                t->parms.o_key = p.o_key;
1060                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
1061                                memcpy(dev->broadcast, &p.iph.daddr, 4);
1062                                ipgre_tunnel_link(ign, t);
1063                                netdev_state_change(dev);
1064                        }
1065                }
1066
1067                if (t) {
1068                        err = 0;
1069                        if (cmd == SIOCCHGTUNNEL) {
1070                                t->parms.iph.ttl = p.iph.ttl;
1071                                t->parms.iph.tos = p.iph.tos;
1072                                t->parms.iph.frag_off = p.iph.frag_off;
1073                                if (t->parms.link != p.link) {
1074                                        t->parms.link = p.link;
1075                                        dev->mtu = ipgre_tunnel_bind_dev(dev);
1076                                        netdev_state_change(dev);
1077                                }
1078                        }
1079                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1080                                err = -EFAULT;
1081                } else
1082                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1083                break;
1084
1085        case SIOCDELTUNNEL:
1086                err = -EPERM;
1087                if (!capable(CAP_NET_ADMIN))
1088                        goto done;
1089
1090                if (dev == ign->fb_tunnel_dev) {
1091                        err = -EFAULT;
1092                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1093                                goto done;
1094                        err = -ENOENT;
1095                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1096                                goto done;
1097                        err = -EPERM;
1098                        if (t == netdev_priv(ign->fb_tunnel_dev))
1099                                goto done;
1100                        dev = t->dev;
1101                }
1102                unregister_netdevice(dev);
1103                err = 0;
1104                break;
1105
1106        default:
1107                err = -EINVAL;
1108        }
1109
1110done:
1111        return err;
1112}
1113
1114static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1115{
1116        struct ip_tunnel *tunnel = netdev_priv(dev);
1117        if (new_mtu < 68 ||
1118            new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1119                return -EINVAL;
1120        dev->mtu = new_mtu;
1121        return 0;
1122}
1123
1124/* Nice toy. Unfortunately, useless in real life :-)
1125   It allows constructing a virtual multiprotocol broadcast "LAN"
1126   over the Internet, provided multicast routing is tuned.
1127
1128
1129   I have no idea whether this bicycle was invented before me,
1130   so I had to set ARPHRD_IPGRE to a random value.
1131   I have the impression that Cisco could make something similar,
1132   but this feature is apparently missing in IOS<=11.2(8).
1133
1134   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1135   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1136
1137   ping -t 255 224.66.66.66
1138
1139   If nobody answers, mbone does not work.
1140
1141   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1142   ip addr add 10.66.66.<somewhat>/24 dev Universe
1143   ifconfig Universe up
1144   ifconfig Universe add fe80::<Your_real_addr>/10
1145   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1146   ftp 10.66.66.66
1147   ...
1148   ftp fec0:6666:6666::193.233.7.65
1149   ...
1150
1151 */
1152
1153static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1154                        unsigned short type,
1155                        const void *daddr, const void *saddr, unsigned int len)
1156{
1157        struct ip_tunnel *t = netdev_priv(dev);
1158        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1159        __be16 *p = (__be16*)(iph+1);
1160
1161        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1162        p[0]            = t->parms.o_flags;
1163        p[1]            = htons(type);
1164
1165        /*
1166         *      Set the source hardware address.
1167         */
1168
1169        if (saddr)
1170                memcpy(&iph->saddr, saddr, 4);
1171        if (daddr)
1172                memcpy(&iph->daddr, daddr, 4);
1173        if (iph->daddr)
1174                return t->hlen;
1175
1176        return -t->hlen;
1177}
1178
1179static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1180{
1181        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1182        memcpy(haddr, &iph->saddr, 4);
1183        return 4;
1184}
1185
1186static const struct header_ops ipgre_header_ops = {
1187        .create = ipgre_header,
1188        .parse  = ipgre_header_parse,
1189};
1190
1191#ifdef CONFIG_NET_IPGRE_BROADCAST
1192static int ipgre_open(struct net_device *dev)
1193{
1194        struct ip_tunnel *t = netdev_priv(dev);
1195
1196        if (ipv4_is_multicast(t->parms.iph.daddr)) {
1197                struct flowi4 fl4;
1198                struct rtable *rt;
1199
1200                rt = ip_route_output_gre(dev_net(dev), &fl4,
1201                                         t->parms.iph.daddr,
1202                                         t->parms.iph.saddr,
1203                                         t->parms.o_key,
1204                                         RT_TOS(t->parms.iph.tos),
1205                                         t->parms.link);
1206                if (IS_ERR(rt))
1207                        return -EADDRNOTAVAIL;
1208                dev = rt->dst.dev;
1209                ip_rt_put(rt);
1210                if (__in_dev_get_rtnl(dev) == NULL)
1211                        return -EADDRNOTAVAIL;
1212                t->mlink = dev->ifindex;
1213                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1214        }
1215        return 0;
1216}
1217
1218static int ipgre_close(struct net_device *dev)
1219{
1220        struct ip_tunnel *t = netdev_priv(dev);
1221
1222        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1223                struct in_device *in_dev;
1224                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1225                if (in_dev)
1226                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1227        }
1228        return 0;
1229}
1230
1231#endif
1232
1233static const struct net_device_ops ipgre_netdev_ops = {
1234        .ndo_init               = ipgre_tunnel_init,
1235        .ndo_uninit             = ipgre_tunnel_uninit,
1236#ifdef CONFIG_NET_IPGRE_BROADCAST
1237        .ndo_open               = ipgre_open,
1238        .ndo_stop               = ipgre_close,
1239#endif
1240        .ndo_start_xmit         = ipgre_tunnel_xmit,
1241        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1242        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1243        .ndo_get_stats          = ipgre_get_stats,
1244};
1245
1246static void ipgre_dev_free(struct net_device *dev)
1247{
1248        free_percpu(dev->tstats);
1249        free_netdev(dev);
1250}
1251
1252static void ipgre_tunnel_setup(struct net_device *dev)
1253{
1254        dev->netdev_ops         = &ipgre_netdev_ops;
1255        dev->destructor         = ipgre_dev_free;
1256
1257        dev->type               = ARPHRD_IPGRE;
1258        dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1259        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1260        dev->flags              = IFF_NOARP;
1261        dev->iflink             = 0;
1262        dev->addr_len           = 4;
1263        dev->features           |= NETIF_F_NETNS_LOCAL;
1264        dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1265}
1266
1267static int ipgre_tunnel_init(struct net_device *dev)
1268{
1269        struct ip_tunnel *tunnel;
1270        struct iphdr *iph;
1271
1272        tunnel = netdev_priv(dev);
1273        iph = &tunnel->parms.iph;
1274
1275        tunnel->dev = dev;
1276        strcpy(tunnel->parms.name, dev->name);
1277
1278        memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1279        memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1280
1281        if (iph->daddr) {
1282#ifdef CONFIG_NET_IPGRE_BROADCAST
1283                if (ipv4_is_multicast(iph->daddr)) {
1284                        if (!iph->saddr)
1285                                return -EINVAL;
1286                        dev->flags = IFF_BROADCAST;
1287                        dev->header_ops = &ipgre_header_ops;
1288                }
1289#endif
1290        } else
1291                dev->header_ops = &ipgre_header_ops;
1292
1293        dev->tstats = alloc_percpu(struct pcpu_tstats);
1294        if (!dev->tstats)
1295                return -ENOMEM;
1296
1297        return 0;
1298}
1299
1300static void ipgre_fb_tunnel_init(struct net_device *dev)
1301{
1302        struct ip_tunnel *tunnel = netdev_priv(dev);
1303        struct iphdr *iph = &tunnel->parms.iph;
1304
1305        tunnel->dev = dev;
1306        strcpy(tunnel->parms.name, dev->name);
1307
1308        iph->version            = 4;
1309        iph->protocol           = IPPROTO_GRE;
1310        iph->ihl                = 5;
1311        tunnel->hlen            = sizeof(struct iphdr) + 4;
1312
1313        dev_hold(dev);
1314}
1315
1316
1317static const struct gre_protocol ipgre_protocol = {
1318        .handler     = ipgre_rcv,
1319        .err_handler = ipgre_err,
1320};
1321
1322static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1323{
1324        int prio;
1325
1326        for (prio = 0; prio < 4; prio++) {
1327                int h;
1328                for (h = 0; h < HASH_SIZE; h++) {
1329                        struct ip_tunnel *t;
1330
1331                        t = rtnl_dereference(ign->tunnels[prio][h]);
1332
1333                        while (t != NULL) {
1334                                unregister_netdevice_queue(t->dev, head);
1335                                t = rtnl_dereference(t->next);
1336                        }
1337                }
1338        }
1339}
1340
1341static int __net_init ipgre_init_net(struct net *net)
1342{
1343        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1344        int err;
1345
1346        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1347                                           ipgre_tunnel_setup);
1348        if (!ign->fb_tunnel_dev) {
1349                err = -ENOMEM;
1350                goto err_alloc_dev;
1351        }
1352        dev_net_set(ign->fb_tunnel_dev, net);
1353
1354        ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1355        ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1356
1357        if ((err = register_netdev(ign->fb_tunnel_dev)))
1358                goto err_reg_dev;
1359
1360        rcu_assign_pointer(ign->tunnels_wc[0],
1361                           netdev_priv(ign->fb_tunnel_dev));
1362        return 0;
1363
1364err_reg_dev:
1365        ipgre_dev_free(ign->fb_tunnel_dev);
1366err_alloc_dev:
1367        return err;
1368}
1369
1370static void __net_exit ipgre_exit_net(struct net *net)
1371{
1372        struct ipgre_net *ign;
1373        LIST_HEAD(list);
1374
1375        ign = net_generic(net, ipgre_net_id);
1376        rtnl_lock();
1377        ipgre_destroy_tunnels(ign, &list);
1378        unregister_netdevice_many(&list);
1379        rtnl_unlock();
1380}
1381
1382static struct pernet_operations ipgre_net_ops = {
1383        .init = ipgre_init_net,
1384        .exit = ipgre_exit_net,
1385        .id   = &ipgre_net_id,
1386        .size = sizeof(struct ipgre_net),
1387};
1388
1389static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1390{
1391        __be16 flags;
1392
1393        if (!data)
1394                return 0;
1395
1396        flags = 0;
1397        if (data[IFLA_GRE_IFLAGS])
1398                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1399        if (data[IFLA_GRE_OFLAGS])
1400                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1401        if (flags & (GRE_VERSION|GRE_ROUTING))
1402                return -EINVAL;
1403
1404        return 0;
1405}
1406
1407static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1408{
1409        __be32 daddr;
1410
1411        if (tb[IFLA_ADDRESS]) {
1412                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1413                        return -EINVAL;
1414                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1415                        return -EADDRNOTAVAIL;
1416        }
1417
1418        if (!data)
1419                goto out;
1420
1421        if (data[IFLA_GRE_REMOTE]) {
1422                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1423                if (!daddr)
1424                        return -EINVAL;
1425        }
1426
1427out:
1428        return ipgre_tunnel_validate(tb, data);
1429}
1430
1431static void ipgre_netlink_parms(struct nlattr *data[],
1432                                struct ip_tunnel_parm *parms)
1433{
1434        memset(parms, 0, sizeof(*parms));
1435
1436        parms->iph.protocol = IPPROTO_GRE;
1437
1438        if (!data)
1439                return;
1440
1441        if (data[IFLA_GRE_LINK])
1442                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1443
1444        if (data[IFLA_GRE_IFLAGS])
1445                parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1446
1447        if (data[IFLA_GRE_OFLAGS])
1448                parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1449
1450        if (data[IFLA_GRE_IKEY])
1451                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1452
1453        if (data[IFLA_GRE_OKEY])
1454                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1455
1456        if (data[IFLA_GRE_LOCAL])
1457                parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1458
1459        if (data[IFLA_GRE_REMOTE])
1460                parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1461
1462        if (data[IFLA_GRE_TTL])
1463                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1464
1465        if (data[IFLA_GRE_TOS])
1466                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1467
1468        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1469                parms->iph.frag_off = htons(IP_DF);
1470}
1471
1472static int ipgre_tap_init(struct net_device *dev)
1473{
1474        struct ip_tunnel *tunnel;
1475
1476        tunnel = netdev_priv(dev);
1477
1478        tunnel->dev = dev;
1479        strcpy(tunnel->parms.name, dev->name);
1480
1481        ipgre_tunnel_bind_dev(dev);
1482
1483        dev->tstats = alloc_percpu(struct pcpu_tstats);
1484        if (!dev->tstats)
1485                return -ENOMEM;
1486
1487        return 0;
1488}
1489
1490static const struct net_device_ops ipgre_tap_netdev_ops = {
1491        .ndo_init               = ipgre_tap_init,
1492        .ndo_uninit             = ipgre_tunnel_uninit,
1493        .ndo_start_xmit         = ipgre_tunnel_xmit,
1494        .ndo_set_mac_address    = eth_mac_addr,
1495        .ndo_validate_addr      = eth_validate_addr,
1496        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1497        .ndo_get_stats          = ipgre_get_stats,
1498};
1499
1500static void ipgre_tap_setup(struct net_device *dev)
1501{
1502
1503        ether_setup(dev);
1504
1505        dev->netdev_ops         = &ipgre_tap_netdev_ops;
1506        dev->destructor         = ipgre_dev_free;
1507
1508        dev->iflink             = 0;
1509        dev->features           |= NETIF_F_NETNS_LOCAL;
1510}
1511
1512static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1513                         struct nlattr *data[])
1514{
1515        struct ip_tunnel *nt;
1516        struct net *net = dev_net(dev);
1517        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1518        int mtu;
1519        int err;
1520
1521        nt = netdev_priv(dev);
1522        ipgre_netlink_parms(data, &nt->parms);
1523
1524        if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1525                return -EEXIST;
1526
1527        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1528                random_ether_addr(dev->dev_addr);
1529
1530        mtu = ipgre_tunnel_bind_dev(dev);
1531        if (!tb[IFLA_MTU])
1532                dev->mtu = mtu;
1533
1534        /* Can use a lockless transmit, unless we generate output sequences */
1535        if (!(nt->parms.o_flags & GRE_SEQ))
1536                dev->features |= NETIF_F_LLTX;
1537
1538        err = register_netdevice(dev);
1539        if (err)
1540                goto out;
1541
1542        dev_hold(dev);
1543        ipgre_tunnel_link(ign, nt);
1544
1545out:
1546        return err;
1547}
1548
1549static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1550                            struct nlattr *data[])
1551{
1552        struct ip_tunnel *t, *nt;
1553        struct net *net = dev_net(dev);
1554        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1555        struct ip_tunnel_parm p;
1556        int mtu;
1557
1558        if (dev == ign->fb_tunnel_dev)
1559                return -EINVAL;
1560
1561        nt = netdev_priv(dev);
1562        ipgre_netlink_parms(data, &p);
1563
1564        t = ipgre_tunnel_locate(net, &p, 0);
1565
1566        if (t) {
1567                if (t->dev != dev)
1568                        return -EEXIST;
1569        } else {
1570                t = nt;
1571
1572                if (dev->type != ARPHRD_ETHER) {
1573                        unsigned int nflags = 0;
1574
1575                        if (ipv4_is_multicast(p.iph.daddr))
1576                                nflags = IFF_BROADCAST;
1577                        else if (p.iph.daddr)
1578                                nflags = IFF_POINTOPOINT;
1579
1580                        if ((dev->flags ^ nflags) &
1581                            (IFF_POINTOPOINT | IFF_BROADCAST))
1582                                return -EINVAL;
1583                }
1584
1585                ipgre_tunnel_unlink(ign, t);
1586                t->parms.iph.saddr = p.iph.saddr;
1587                t->parms.iph.daddr = p.iph.daddr;
1588                t->parms.i_key = p.i_key;
1589                if (dev->type != ARPHRD_ETHER) {
1590                        memcpy(dev->dev_addr, &p.iph.saddr, 4);
1591                        memcpy(dev->broadcast, &p.iph.daddr, 4);
1592                }
1593                ipgre_tunnel_link(ign, t);
1594                netdev_state_change(dev);
1595        }
1596
1597        t->parms.o_key = p.o_key;
1598        t->parms.iph.ttl = p.iph.ttl;
1599        t->parms.iph.tos = p.iph.tos;
1600        t->parms.iph.frag_off = p.iph.frag_off;
1601
1602        if (t->parms.link != p.link) {
1603                t->parms.link = p.link;
1604                mtu = ipgre_tunnel_bind_dev(dev);
1605                if (!tb[IFLA_MTU])
1606                        dev->mtu = mtu;
1607                netdev_state_change(dev);
1608        }
1609
1610        return 0;
1611}
1612
1613static size_t ipgre_get_size(const struct net_device *dev)
1614{
1615        return
1616                /* IFLA_GRE_LINK */
1617                nla_total_size(4) +
1618                /* IFLA_GRE_IFLAGS */
1619                nla_total_size(2) +
1620                /* IFLA_GRE_OFLAGS */
1621                nla_total_size(2) +
1622                /* IFLA_GRE_IKEY */
1623                nla_total_size(4) +
1624                /* IFLA_GRE_OKEY */
1625                nla_total_size(4) +
1626                /* IFLA_GRE_LOCAL */
1627                nla_total_size(4) +
1628                /* IFLA_GRE_REMOTE */
1629                nla_total_size(4) +
1630                /* IFLA_GRE_TTL */
1631                nla_total_size(1) +
1632                /* IFLA_GRE_TOS */
1633                nla_total_size(1) +
1634                /* IFLA_GRE_PMTUDISC */
1635                nla_total_size(1) +
1636                0;
1637}
1638
1639static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1640{
1641        struct ip_tunnel *t = netdev_priv(dev);
1642        struct ip_tunnel_parm *p = &t->parms;
1643
1644        NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1645        NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1646        NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1647        NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1648        NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1649        NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1650        NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1651        NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1652        NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1653        NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1654
1655        return 0;
1656
1657nla_put_failure:
1658        return -EMSGSIZE;
1659}
1660
1661static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1662        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1663        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1664        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1665        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1666        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1667        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1668        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1669        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1670        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1671        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1672};
1673
1674static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1675        .kind           = "gre",
1676        .maxtype        = IFLA_GRE_MAX,
1677        .policy         = ipgre_policy,
1678        .priv_size      = sizeof(struct ip_tunnel),
1679        .setup          = ipgre_tunnel_setup,
1680        .validate       = ipgre_tunnel_validate,
1681        .newlink        = ipgre_newlink,
1682        .changelink     = ipgre_changelink,
1683        .get_size       = ipgre_get_size,
1684        .fill_info      = ipgre_fill_info,
1685};
1686
1687static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1688        .kind           = "gretap",
1689        .maxtype        = IFLA_GRE_MAX,
1690        .policy         = ipgre_policy,
1691        .priv_size      = sizeof(struct ip_tunnel),
1692        .setup          = ipgre_tap_setup,
1693        .validate       = ipgre_tap_validate,
1694        .newlink        = ipgre_newlink,
1695        .changelink     = ipgre_changelink,
1696        .get_size       = ipgre_get_size,
1697        .fill_info      = ipgre_fill_info,
1698};
1699
1700/*
1701 *      And now the modules code and kernel interface.
1702 */
1703
1704static int __init ipgre_init(void)
1705{
1706        int err;
1707
1708        printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1709
1710        err = register_pernet_device(&ipgre_net_ops);
1711        if (err < 0)
1712                return err;
1713
1714        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1715        if (err < 0) {
1716                printk(KERN_INFO "ipgre init: can't add protocol\n");
1717                goto add_proto_failed;
1718        }
1719
1720        err = rtnl_link_register(&ipgre_link_ops);
1721        if (err < 0)
1722                goto rtnl_link_failed;
1723
1724        err = rtnl_link_register(&ipgre_tap_ops);
1725        if (err < 0)
1726                goto tap_ops_failed;
1727
1728out:
1729        return err;
1730
1731tap_ops_failed:
1732        rtnl_link_unregister(&ipgre_link_ops);
1733rtnl_link_failed:
1734        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1735add_proto_failed:
1736        unregister_pernet_device(&ipgre_net_ops);
1737        goto out;
1738}
1739
1740static void __exit ipgre_fini(void)
1741{
1742        rtnl_link_unregister(&ipgre_tap_ops);
1743        rtnl_link_unregister(&ipgre_link_ops);
1744        if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1745                printk(KERN_INFO "ipgre close: can't remove protocol\n");
1746        unregister_pernet_device(&ipgre_net_ops);
1747}
1748
1749module_init(ipgre_init);
1750module_exit(ipgre_fini);
1751MODULE_LICENSE("GPL");
1752MODULE_ALIAS_RTNL_LINK("gre");
1753MODULE_ALIAS_RTNL_LINK("gretap");
1754MODULE_ALIAS_NETDEV("gre0");
1755