/* linux/net/ipv4/ip_gre.c */
/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */
  12
  13#include <linux/capability.h>
  14#include <linux/module.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <linux/slab.h>
  18#include <asm/uaccess.h>
  19#include <linux/skbuff.h>
  20#include <linux/netdevice.h>
  21#include <linux/in.h>
  22#include <linux/tcp.h>
  23#include <linux/udp.h>
  24#include <linux/if_arp.h>
  25#include <linux/mroute.h>
  26#include <linux/init.h>
  27#include <linux/in6.h>
  28#include <linux/inetdevice.h>
  29#include <linux/igmp.h>
  30#include <linux/netfilter_ipv4.h>
  31#include <linux/etherdevice.h>
  32#include <linux/if_ether.h>
  33
  34#include <net/sock.h>
  35#include <net/ip.h>
  36#include <net/icmp.h>
  37#include <net/protocol.h>
  38#include <net/ipip.h>
  39#include <net/arp.h>
  40#include <net/checksum.h>
  41#include <net/dsfield.h>
  42#include <net/inet_ecn.h>
  43#include <net/xfrm.h>
  44#include <net/net_namespace.h>
  45#include <net/netns/generic.h>
  46#include <net/rtnetlink.h>
  47#include <net/gre.h>
  48
  49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
  50#include <net/ipv6.h>
  51#include <net/ip6_fib.h>
  52#include <net/ip6_route.h>
  53#endif
  54
/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation,
   it is an infeasible task. The most general solution would be
   to keep a skb->encapsulation counter (sort of local ttl),
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
   skb, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT

   2. Networking dead loops would not kill routers, but would really
   kill network. IP hop limit plays role of "t->recursion" in this case,
   if we copy it from packet being encapsulated to upper header.
   It is very good solution, but it introduces two problems:

   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies to rfc1812 now (yes, guys, Linux is the only
     true router now :-)), all routers (at least, in neighbourhood of mine)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something reasonable,
   we should search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force DF flag on tunnels with preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches, that exceed pmtu are pruned) and tunnel mtu
   quickly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop,
   which does not force DF, even when encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it is your gated who injected
   the fatal route to the network, even if it were you who configured
   the fatal static route: you are innocent. :-)


   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is an integral part of IPv6, ipip and gre are naturally modular.
   We could extract the common parts (hash table, ioctl etc)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
 120
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

/* Per-network-namespace id; pass to net_generic() to find our state. */
static int ipgre_net_id __read_mostly;

/* Per-namespace GRE state. */
struct ipgre_net {
	/* Four chains of tunnels, indexed by which of (remote, local)
	 * are wildcarded (see the table above HASH below), each hashed
	 * into HASH_SIZE buckets. Protected by RCU (readers) and RTNL
	 * (writers). */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/* Last-resort device returned by ipgre_tunnel_lookup() when no
	 * configured tunnel matches. */
	struct net_device *fb_tunnel_dev;
};
 136
/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matching configured keyless tunnels
   will match fallback tunnel.
 */

/* Fold a 32-bit address or key into a 4-bit bucket index (0..HASH_SIZE-1). */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Named accessors for the four chains in struct ipgre_net::tunnels. */
#define tunnels_r_l	tunnels[3]	/* (remote,local) */
#define tunnels_r	tunnels[2]	/* (remote,*)     */
#define tunnels_l	tunnels[1]	/* (*,local)      */
#define tunnels_wc	tunnels[0]	/* (*,*) wildcard */
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one RCU-protected chain; expects the caller to have declared
 * `struct ip_tunnel *t` and to hold rcu_read_lock(). */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 167
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	unsigned long	rx_packets;	/* frames decapsulated on this cpu */
	unsigned long	rx_bytes;	/* bytes decapsulated on this cpu */
	unsigned long	tx_packets;	/* frames encapsulated on this cpu */
	unsigned long	tx_bytes;	/* bytes encapsulated on this cpu */
};
 175
 176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
 177{
 178        struct pcpu_tstats sum = { 0 };
 179        int i;
 180
 181        for_each_possible_cpu(i) {
 182                const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
 183
 184                sum.rx_packets += tstats->rx_packets;
 185                sum.rx_bytes   += tstats->rx_bytes;
 186                sum.tx_packets += tstats->tx_packets;
 187                sum.tx_bytes   += tstats->tx_bytes;
 188        }
 189        dev->stats.rx_packets = sum.rx_packets;
 190        dev->stats.rx_bytes   = sum.rx_bytes;
 191        dev->stats.tx_packets = sum.tx_packets;
 192        dev->stats.tx_bytes   = sum.tx_bytes;
 193        return &dev->stats;
 194}
 195
/* Given src, dst and key, find appropriate for input tunnel. */

/*
 * RCU lookup of the receiving tunnel for (remote, local, key, proto).
 * The four chains are tried from most to least specific; within each
 * chain, an exact match on both link and device type returns at once,
 * otherwise the lowest-scoring candidate across all chains wins
 * (bit 0 of score = wrong link, bit 1 = wrong device type). If nothing
 * matches, the fallback device is returned when it is up, else NULL.
 * Caller must hold rcu_read_lock() (for_each_ip_tunnel_rcu).
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payloads belong to Ethernet-over-GRE (ARPHRD_ETHER)
	 * devices, everything else to plain GRE (ARPHRD_IPGRE) ones. */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* cand_score starts above the worst achievable score (3). */
	int score, cand_score = 4;

	/* Pass 1: (remote,local) tunnels. */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		/* Accept exact device-type matches, plus ARPHRD_IPGRE
		 * devices regardless of payload type. */
		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 2: (remote,*) tunnels. */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 3: (*,local) tunnels; a multicast destination may also
	 * match a tunnel whose configured remote equals it. */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 4: (*,*) wildcard tunnels, matched by key only. */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Nothing matched: hand the packet to the fallback device. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
 319
 320static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
 321                struct ip_tunnel_parm *parms)
 322{
 323        __be32 remote = parms->iph.daddr;
 324        __be32 local = parms->iph.saddr;
 325        __be32 key = parms->i_key;
 326        unsigned int h = HASH(key);
 327        int prio = 0;
 328
 329        if (local)
 330                prio |= 1;
 331        if (remote && !ipv4_is_multicast(remote)) {
 332                prio |= 2;
 333                h ^= HASH(remote);
 334        }
 335
 336        return &ign->tunnels[prio][h];
 337}
 338
/* Convenience wrapper: bucket for an existing tunnel's parameters. */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
 344
/*
 * Insert @t at the head of its hash chain. The first rcu_assign_pointer
 * publishes the old head through t->next before the second one makes @t
 * visible, so concurrent RCU readers always see a consistent chain.
 * Writer side; NOTE(review): rtnl_dereference implies RTNL is held —
 * confirm against callers.
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
 352
/*
 * Remove @t from its hash chain, if present (silently a no-op
 * otherwise). Writer side, paired with ipgre_tunnel_link().
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			/* Point the predecessor past us; t->next is left
			 * intact so concurrent RCU readers already on @t
			 * can still finish walking the chain. */
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
 367
 368static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 369                                           struct ip_tunnel_parm *parms,
 370                                           int type)
 371{
 372        __be32 remote = parms->iph.daddr;
 373        __be32 local = parms->iph.saddr;
 374        __be32 key = parms->i_key;
 375        int link = parms->link;
 376        struct ip_tunnel *t;
 377        struct ip_tunnel __rcu **tp;
 378        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 379
 380        for (tp = __ipgre_bucket(ign, parms);
 381             (t = rtnl_dereference(*tp)) != NULL;
 382             tp = &t->next)
 383                if (local == t->parms.iph.saddr &&
 384                    remote == t->parms.iph.daddr &&
 385                    key == t->parms.i_key &&
 386                    link == t->parms.link &&
 387                    type == t->dev->type)
 388                        break;
 389
 390        return t;
 391}
 392
/*
 * Look up a plain (ARPHRD_IPGRE) tunnel matching @parms; when none
 * exists and @create is set, allocate, register and hash a new tunnel
 * device. Returns the tunnel, or NULL when not found (and !create) or
 * on allocation/registration failure. NOTE(review): register_netdevice
 * suggests the caller holds RTNL — confirm against call sites.
 */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	/* Use the requested name, or fall back to a "gre%d" template. */
	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	/* A '%' template needs a unique name before registration. */
	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)
			goto failed_free;
	}

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Extra reference, dropped by dev_put() in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
 438
/*
 * Device teardown: unhash the tunnel and drop the reference taken by
 * dev_hold() when the tunnel was created.
 */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
 447
 448
/*
 * ICMP error handler for outgoing GRE: parse the returned headers of
 * the offending packet, find the originating tunnel, and record the
 * error in err_count/err_time so the next transmit can signal a link
 * failure (see the err_count handling in the xmit path).
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr *)skb->data;
	/* p points at the start of the returned GRE header. */
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	/* grehlen: returned IP header + 4-byte base GRE header ... */
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* ... plus the optional checksum and key words, so that
		 * the key ends up as the last word within grehlen. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	/* Only report errors we can meaningfully attribute. */
	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	/* Key (when present) is the last 32-bit word inside grehlen. */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* With inherited TTL, TTL-exceeded is expected (traceroute). */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Count errors within the IPTUNNEL_ERR_TIMEO window. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}
 536
 537static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
 538{
 539        if (INET_ECN_is_ce(iph->tos)) {
 540                if (skb->protocol == htons(ETH_P_IP)) {
 541                        IP_ECN_set_ce(ip_hdr(skb));
 542                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 543                        IP6_ECN_set_ce(ipv6_hdr(skb));
 544                }
 545        }
 546}
 547
 548static inline u8
 549ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
 550{
 551        u8 inner = 0;
 552        if (skb->protocol == htons(ETH_P_IP))
 553                inner = old_iph->tos;
 554        else if (skb->protocol == htons(ETH_P_IPV6))
 555                inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
 556        return INET_ECN_encapsulate(tos, inner);
 557}
 558
/*
 * GRE receive path: validate the GRE header (version 0, no routing),
 * verify the optional checksum, extract key/sequence, find the owning
 * tunnel and deliver the decapsulated payload via netif_rx(). Packets
 * for which no tunnel exists are answered with ICMP port-unreachable.
 * Always consumes @skb and returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;		/* start of GRE header */
	__be16    flags;
	__sum16   csum = 0;	/* nonzero => checksum verification failed */
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* bytes of GRE header consumed so far */
	__be16 gre_proto;

	/* Enough for the base header plus all three optional words. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				/* Hardware-provided sum: zero fold means OK. */
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				/* Recompute the full checksum in software. */
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	/* Protocol field sits right after the flags word. */
	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* Not an IPv4 header yet => WCCPv2 redirect word. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip the GRE header; fix up any complete checksum. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Checksum must verify when present, and must be present
		 * when the tunnel is configured to expect one. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		/* Enforce in-order sequence numbers when configured. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			/* Reload after pskb_may_pull may have moved data. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	/* No matching tunnel: tell the sender. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}
 695
 696static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 697{
 698        struct ip_tunnel *tunnel = netdev_priv(dev);
 699        struct pcpu_tstats *tstats;
 700        struct iphdr  *old_iph = ip_hdr(skb);
 701        struct iphdr  *tiph;
 702        u8     tos;
 703        __be16 df;
 704        struct rtable *rt;                      /* Route to the other host */
 705        struct net_device *tdev;                /* Device to other host */
 706        struct iphdr  *iph;                     /* Our new IP header */
 707        unsigned int max_headroom;              /* The extra header space needed */
 708        int    gre_hlen;
 709        __be32 dst;
 710        int    mtu;
 711
 712        if (dev->type == ARPHRD_ETHER)
 713                IPCB(skb)->flags = 0;
 714
 715        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 716                gre_hlen = 0;
 717                tiph = (struct iphdr *)skb->data;
 718        } else {
 719                gre_hlen = tunnel->hlen;
 720                tiph = &tunnel->parms.iph;
 721        }
 722
 723        if ((dst = tiph->daddr) == 0) {
 724                /* NBMA tunnel */
 725
 726                if (skb_dst(skb) == NULL) {
 727                        dev->stats.tx_fifo_errors++;
 728                        goto tx_error;
 729                }
 730
 731                if (skb->protocol == htons(ETH_P_IP)) {
 732                        rt = skb_rtable(skb);
 733                        if ((dst = rt->rt_gateway) == 0)
 734                                goto tx_error_icmp;
 735                }
 736#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 737                else if (skb->protocol == htons(ETH_P_IPV6)) {
 738                        struct in6_addr *addr6;
 739                        int addr_type;
 740                        struct neighbour *neigh = skb_dst(skb)->neighbour;
 741
 742                        if (neigh == NULL)
 743                                goto tx_error;
 744
 745                        addr6 = (struct in6_addr *)&neigh->primary_key;
 746                        addr_type = ipv6_addr_type(addr6);
 747
 748                        if (addr_type == IPV6_ADDR_ANY) {
 749                                addr6 = &ipv6_hdr(skb)->daddr;
 750                                addr_type = ipv6_addr_type(addr6);
 751                        }
 752
 753                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 754                                goto tx_error_icmp;
 755
 756                        dst = addr6->s6_addr32[3];
 757                }
 758#endif
 759                else
 760                        goto tx_error;
 761        }
 762
 763        tos = tiph->tos;
 764        if (tos == 1) {
 765                tos = 0;
 766                if (skb->protocol == htons(ETH_P_IP))
 767                        tos = old_iph->tos;
 768                else if (skb->protocol == htons(ETH_P_IPV6))
 769                        tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
 770        }
 771
 772        {
 773                struct flowi fl = {
 774                        .oif = tunnel->parms.link,
 775                        .fl4_dst = dst,
 776                        .fl4_src = tiph->saddr,
 777                        .fl4_tos = RT_TOS(tos),
 778                        .proto = IPPROTO_GRE,
 779                        .fl_gre_key = tunnel->parms.o_key
 780                };
 781                if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
 782                        dev->stats.tx_carrier_errors++;
 783                        goto tx_error;
 784                }
 785        }
 786        tdev = rt->dst.dev;
 787
 788        if (tdev == dev) {
 789                ip_rt_put(rt);
 790                dev->stats.collisions++;
 791                goto tx_error;
 792        }
 793
 794        df = tiph->frag_off;
 795        if (df)
 796                mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
 797        else
 798                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 799
 800        if (skb_dst(skb))
 801                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 802
 803        if (skb->protocol == htons(ETH_P_IP)) {
 804                df |= (old_iph->frag_off&htons(IP_DF));
 805
 806                if ((old_iph->frag_off&htons(IP_DF)) &&
 807                    mtu < ntohs(old_iph->tot_len)) {
 808                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 809                        ip_rt_put(rt);
 810                        goto tx_error;
 811                }
 812        }
 813#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 814        else if (skb->protocol == htons(ETH_P_IPV6)) {
 815                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 816
 817                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
 818                        if ((tunnel->parms.iph.daddr &&
 819                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 820                            rt6->rt6i_dst.plen == 128) {
 821                                rt6->rt6i_flags |= RTF_MODIFIED;
 822                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 823                        }
 824                }
 825
 826                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
 827                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 828                        ip_rt_put(rt);
 829                        goto tx_error;
 830                }
 831        }
 832#endif
 833
 834        if (tunnel->err_count > 0) {
 835                if (time_before(jiffies,
 836                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 837                        tunnel->err_count--;
 838
 839                        dst_link_failure(skb);
 840                } else
 841                        tunnel->err_count = 0;
 842        }
 843
 844        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
 845
 846        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 847            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
 848                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 849                if (max_headroom > dev->needed_headroom)
 850                        dev->needed_headroom = max_headroom;
 851                if (!new_skb) {
 852                        ip_rt_put(rt);
 853                        dev->stats.tx_dropped++;
 854                        dev_kfree_skb(skb);
 855                        return NETDEV_TX_OK;
 856                }
 857                if (skb->sk)
 858                        skb_set_owner_w(new_skb, skb->sk);
 859                dev_kfree_skb(skb);
 860                skb = new_skb;
 861                old_iph = ip_hdr(skb);
 862        }
 863
 864        skb_reset_transport_header(skb);
 865        skb_push(skb, gre_hlen);
 866        skb_reset_network_header(skb);
 867        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 868        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 869                              IPSKB_REROUTED);
 870        skb_dst_drop(skb);
 871        skb_dst_set(skb, &rt->dst);
 872
 873        /*
 874         *      Push down and install the IPIP header.
 875         */
 876
 877        iph                     =       ip_hdr(skb);
 878        iph->version            =       4;
 879        iph->ihl                =       sizeof(struct iphdr) >> 2;
 880        iph->frag_off           =       df;
 881        iph->protocol           =       IPPROTO_GRE;
 882        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
 883        iph->daddr              =       rt->rt_dst;
 884        iph->saddr              =       rt->rt_src;
 885
 886        if ((iph->ttl = tiph->ttl) == 0) {
 887                if (skb->protocol == htons(ETH_P_IP))
 888                        iph->ttl = old_iph->ttl;
 889#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 890                else if (skb->protocol == htons(ETH_P_IPV6))
 891                        iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
 892#endif
 893                else
 894                        iph->ttl = ip4_dst_hoplimit(&rt->dst);
 895        }
 896
 897        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
 898        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
 899                                   htons(ETH_P_TEB) : skb->protocol;
 900
 901        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 902                __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
 903
 904                if (tunnel->parms.o_flags&GRE_SEQ) {
 905                        ++tunnel->o_seqno;
 906                        *ptr = htonl(tunnel->o_seqno);
 907                        ptr--;
 908                }
 909                if (tunnel->parms.o_flags&GRE_KEY) {
 910                        *ptr = tunnel->parms.o_key;
 911                        ptr--;
 912                }
 913                if (tunnel->parms.o_flags&GRE_CSUM) {
 914                        *ptr = 0;
 915                        *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
 916                }
 917        }
 918
 919        nf_reset(skb);
 920        tstats = this_cpu_ptr(dev->tstats);
 921        __IPTUNNEL_XMIT(tstats, &dev->stats);
 922        return NETDEV_TX_OK;
 923
 924tx_error_icmp:
 925        dst_link_failure(skb);
 926
 927tx_error:
 928        dev->stats.tx_errors++;
 929        dev_kfree_skb(skb);
 930        return NETDEV_TX_OK;
 931}
 932
/*
 * Recompute the underlying output device, needed headroom and tunnel
 * header length for @dev from its current parameters.  Returns the MTU
 * the tunnel device should use (clamped to at least 68, the IPv4
 * minimum MTU per RFC 791).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* IP header + base GRE header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = {
			.oif = tunnel->parms.link,
			.fl4_dst = iph->daddr,
			.fl4_src = iph->saddr,
			.fl4_tos = RT_TOS(iph->tos),
			.proto = IPPROTO_GRE,
			.fl_gre_key = tunnel->parms.o_key
		};
		struct rtable *rt;

		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route (or no remote address): fall back to the bound link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	/* 68 is the minimum MTU an IPv4 host must support (RFC 791). */
	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
 995
/*
 * Legacy (pre-rtnetlink) tunnel configuration via SIOC{GET,ADD,CHG,DEL}TUNNEL.
 * On the fallback device "gre0" the ioctl may address any tunnel in this
 * namespace (looked up from the user-supplied parameters); on a specific
 * tunnel device it operates on that device itself.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Only plain IPv4/GRE headers; no GRE version/routing bits. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Drop key values the caller did not flag as present. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* The p-t-p/broadcast mode of an existing
				 * device may not be changed. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Rehash: unlink, wait out RCU readers of the
				 * old hash position, update, then relink. */
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			/* The fallback device itself cannot be deleted. */
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1126
1127static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1128{
1129        struct ip_tunnel *tunnel = netdev_priv(dev);
1130        if (new_mtu < 68 ||
1131            new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1132                return -EINVAL;
1133        dev->mtu = new_mtu;
1134        return 0;
1135}
1136
1137/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1139   over the Internet, provided multicast routing is tuned.
1140
1141
   I have no idea whether this bicycle was invented before me,
1143   so that I had to set ARPHRD_IPGRE to a random value.
1144   I have an impression, that Cisco could make something similar,
1145   but this feature is apparently missing in IOS<=11.2(8).
1146
1147   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1148   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1149
1150   ping -t 255 224.66.66.66
1151
1152   If nobody answers, mbone does not work.
1153
1154   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1155   ip addr add 10.66.66.<somewhat>/24 dev Universe
1156   ifconfig Universe up
1157   ifconfig Universe add fe80::<Your_real_addr>/10
1158   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1159   ftp 10.66.66.66
1160   ...
1161   ftp fec0:6666:6666::193.233.7.65
1162   ...
1163
1164 */
1165
/*
 * hard_header create hook, used in broadcast/NBMA modes: push the
 * precomputed IP + GRE header onto the skb.  Returns t->hlen when the
 * header is complete, or -t->hlen when the destination address still
 * has to be resolved by the caller (NBMA mode, daddr == 0).
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);	/* GRE flags and protocol words */

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}
1191
1192static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1193{
1194        struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1195        memcpy(haddr, &iph->saddr, 4);
1196        return 4;
1197}
1198
/* Link-layer ops installed for broadcast/NBMA tunnels (see ipgre_tunnel_init). */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse	= ipgre_header_parse,
};
1203
1204#ifdef CONFIG_NET_IPGRE_BROADCAST
/* ndo_open: for a multicast tunnel, join the group on the device the
 * route to the multicast destination goes out of. */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = {
			.oif = t->parms.link,
			.fl4_dst = t->parms.iph.daddr,
			.fl4_src = t->parms.iph.saddr,
			.fl4_tos = RT_TOS(t->parms.iph.tos),
			.proto = IPPROTO_GRE,
			.fl_gre_key = t->parms.o_key
		};
		struct rtable *rt;

		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		/* note: 'dev' is reused for the route's output device below */
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;	/* remembered for ipgre_close() */
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1231
1232static int ipgre_close(struct net_device *dev)
1233{
1234        struct ip_tunnel *t = netdev_priv(dev);
1235
1236        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1237                struct in_device *in_dev;
1238                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1239                if (in_dev)
1240                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1241        }
1242        return 0;
1243}
1244
1245#endif
1246
/* Device ops for plain (ARPHRD_IPGRE) tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1259
/* dev->destructor: release the per-cpu stats before the netdev itself. */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1265
/* Common netdev setup for ARPHRD_IPGRE tunnel devices. */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	/* Worst-case defaults; refined by ipgre_tunnel_bind_dev() later. */
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* "hardware" address is an IPv4 address */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1280
/* ndo_init for ARPHRD_IPGRE devices: set link addresses, pick header
 * ops for broadcast/NBMA modes and allocate per-cpu stats. */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		/* Multicast remote: broadcast-style link; a local address
		 * is required to join the group in ipgre_open(). */
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* No fixed remote: NBMA mode, daddr supplied per packet. */
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1313
/*
 * Set up the per-namespace fallback device "gre0", which catches GRE
 * traffic not matching any configured tunnel.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;	/* plain GRE, no options */

	/* Extra reference pins the fallback device until netns teardown. */
	dev_hold(dev);
}
1329
1330
/* GRE demux hooks for protocol version 0 (GREPROTO_CISCO). */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1335
/*
 * Queue every tunnel of this namespace (all four hash tables: wildcard,
 * keyed, addressed, both) for unregistration on @head.  Caller holds
 * RTNL and later flushes with unregister_netdevice_many().
 */
static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}
1354
/* Per-namespace init: create and register the "gre0" fallback device. */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	/* Hash the fallback tunnel as the wildcard (no key/address) entry. */
	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	/* destructor is not invoked on a never-registered netdev */
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1383
1384static void __net_exit ipgre_exit_net(struct net *net)
1385{
1386        struct ipgre_net *ign;
1387        LIST_HEAD(list);
1388
1389        ign = net_generic(net, ipgre_net_id);
1390        rtnl_lock();
1391        ipgre_destroy_tunnels(ign, &list);
1392        unregister_netdevice_many(&list);
1393        rtnl_unlock();
1394}
1395
/* Pernet registration; .size makes the core allocate struct ipgre_net. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1402
1403static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1404{
1405        __be16 flags;
1406
1407        if (!data)
1408                return 0;
1409
1410        flags = 0;
1411        if (data[IFLA_GRE_IFLAGS])
1412                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1413        if (data[IFLA_GRE_OFLAGS])
1414                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1415        if (flags & (GRE_VERSION|GRE_ROUTING))
1416                return -EINVAL;
1417
1418        return 0;
1419}
1420
1421static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1422{
1423        __be32 daddr;
1424
1425        if (tb[IFLA_ADDRESS]) {
1426                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1427                        return -EINVAL;
1428                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1429                        return -EADDRNOTAVAIL;
1430        }
1431
1432        if (!data)
1433                goto out;
1434
1435        if (data[IFLA_GRE_REMOTE]) {
1436                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1437                if (!daddr)
1438                        return -EINVAL;
1439        }
1440
1441out:
1442        return ipgre_tunnel_validate(tb, data);
1443}
1444
/*
 * Translate IFLA_GRE_* netlink attributes into tunnel parameters.
 * Absent attributes leave the zeroed default, except PMTU discovery,
 * which defaults to on (DF set).
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery is on unless explicitly disabled. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1485
/* ndo_init for gretap (Ethernet-over-GRE) devices. */
static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* computes hlen/needed_headroom; returned MTU is ignored here */
	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1503
/* Device ops for gretap devices; Ethernet address handling instead of
 * the tunnel ioctl/open/stop hooks used by plain GRE. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1513
/* Netdev setup for gretap: an Ethernet device carried over GRE. */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1525
/* rtnl_link_ops .newlink: create a gre/gretap device from netlink attrs. */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* Refuse duplicates of an existing tunnel of the same type. */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;	/* user-supplied MTU takes precedence */

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* Reference held for the hash-table linkage, dropped in uninit. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1562
/* rtnl_link_ops .changelink: update an existing gre/gretap device.
 * Addresses/ikey changes require rehashing; ttl/tos/frag_off/o_key
 * can be updated in place. */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* New parameters may only collide with this device itself. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* The p-t-p/broadcast mode may not be changed. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Rehash under the new addresses/input key. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* These do not affect the hash position; update unconditionally. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1626
1627static size_t ipgre_get_size(const struct net_device *dev)
1628{
1629        return
1630                /* IFLA_GRE_LINK */
1631                nla_total_size(4) +
1632                /* IFLA_GRE_IFLAGS */
1633                nla_total_size(2) +
1634                /* IFLA_GRE_OFLAGS */
1635                nla_total_size(2) +
1636                /* IFLA_GRE_IKEY */
1637                nla_total_size(4) +
1638                /* IFLA_GRE_OKEY */
1639                nla_total_size(4) +
1640                /* IFLA_GRE_LOCAL */
1641                nla_total_size(4) +
1642                /* IFLA_GRE_REMOTE */
1643                nla_total_size(4) +
1644                /* IFLA_GRE_TTL */
1645                nla_total_size(1) +
1646                /* IFLA_GRE_TOS */
1647                nla_total_size(1) +
1648                /* IFLA_GRE_PMTUDISC */
1649                nla_total_size(1) +
1650                0;
1651}
1652
/*
 * Dump tunnel parameters to netlink.  The NLA_PUT* macros jump to
 * nla_put_failure when the skb runs out of tailroom.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* report PMTU discovery as a boolean derived from the DF bit */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1674
/* Netlink attribute policy shared by the gre and gretap link ops. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1687
/* rtnetlink ops for plain GRE ("ip link add ... type gre"). */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1700
/* rtnetlink ops for Ethernet-over-GRE ("ip link add ... type gretap"). */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1713
1714/*
1715 *      And now the modules code and kernel interface.
1716 */
1717
1718static int __init ipgre_init(void)
1719{
1720        int err;
1721
1722        printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1723
1724        err = register_pernet_device(&ipgre_net_ops);
1725        if (err < 0)
1726                return err;
1727
1728        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1729        if (err < 0) {
1730                printk(KERN_INFO "ipgre init: can't add protocol\n");
1731                goto add_proto_failed;
1732        }
1733
1734        err = rtnl_link_register(&ipgre_link_ops);
1735        if (err < 0)
1736                goto rtnl_link_failed;
1737
1738        err = rtnl_link_register(&ipgre_tap_ops);
1739        if (err < 0)
1740                goto tap_ops_failed;
1741
1742out:
1743        return err;
1744
1745tap_ops_failed:
1746        rtnl_link_unregister(&ipgre_link_ops);
1747rtnl_link_failed:
1748        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1749add_proto_failed:
1750        unregister_pernet_device(&ipgre_net_ops);
1751        goto out;
1752}
1753
1754static void __exit ipgre_fini(void)
1755{
1756        rtnl_link_unregister(&ipgre_tap_ops);
1757        rtnl_link_unregister(&ipgre_link_ops);
1758        if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1759                printk(KERN_INFO "ipgre close: can't remove protocol\n");
1760        unregister_pernet_device(&ipgre_net_ops);
1761}
1762
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Auto-load this module when "gre"/"gretap" rtnl link kinds are requested. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
/* Auto-load when the fallback device "gre0" is opened. */
MODULE_ALIAS_NETDEV("gre0");
1769