linux/net/ipv4/ip_gre.c
<<
>>
Prefs
   1/*
   2 *      Linux NET3:     GRE over IP protocol decoder.
   3 *
   4 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   5 *
   6 *      This program is free software; you can redistribute it and/or
   7 *      modify it under the terms of the GNU General Public License
   8 *      as published by the Free Software Foundation; either version
   9 *      2 of the License, or (at your option) any later version.
  10 *
  11 */
  12
  13#include <linux/capability.h>
  14#include <linux/module.h>
  15#include <linux/types.h>
  16#include <linux/kernel.h>
  17#include <asm/uaccess.h>
  18#include <linux/skbuff.h>
  19#include <linux/netdevice.h>
  20#include <linux/in.h>
  21#include <linux/tcp.h>
  22#include <linux/udp.h>
  23#include <linux/if_arp.h>
  24#include <linux/mroute.h>
  25#include <linux/init.h>
  26#include <linux/in6.h>
  27#include <linux/inetdevice.h>
  28#include <linux/igmp.h>
  29#include <linux/netfilter_ipv4.h>
  30#include <linux/etherdevice.h>
  31#include <linux/if_ether.h>
  32
  33#include <net/sock.h>
  34#include <net/ip.h>
  35#include <net/icmp.h>
  36#include <net/protocol.h>
  37#include <net/ipip.h>
  38#include <net/arp.h>
  39#include <net/checksum.h>
  40#include <net/dsfield.h>
  41#include <net/inet_ecn.h>
  42#include <net/xfrm.h>
  43#include <net/net_namespace.h>
  44#include <net/netns/generic.h>
  45#include <net/rtnetlink.h>
  46
  47#ifdef CONFIG_IPV6
  48#include <net/ipv6.h>
  49#include <net/ip6_fib.h>
  50#include <net/ip6_route.h>
  51#endif
  52
  53/*
  54   Problems & solutions
  55   --------------------
  56
  57   1. The most important issue is detecting local dead loops.
  58   They would cause complete host lockup in transmit, which
  59   would be "resolved" by stack overflow or, if queueing is enabled,
  60   with infinite looping in net_bh.
  61
  62   We cannot track such dead loops during route installation,
  63   it is infeasible task. The most general solutions would be
  64   to keep skb->encapsulation counter (sort of local ttl),
  65   and silently drop packet when it expires. It is the best
   66   solution, but it supposes maintaining a new variable in ALL
  67   skb, even if no tunneling is used.
  68
  69   Current solution: HARD_TX_LOCK lock breaks dead loops.
  70
  71
  72
  73   2. Networking dead loops would not kill routers, but would really
  74   kill network. IP hop limit plays role of "t->recursion" in this case,
  75   if we copy it from packet being encapsulated to upper header.
  76   It is very good solution, but it introduces two problems:
  77
  78   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  79     do not work over tunnels.
  80   - traceroute does not work. I planned to relay ICMP from tunnel,
  81     so that this problem would be solved and traceroute output
   82     would be even more informative. This idea appeared to be wrong:
  83     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  84     true router now :-)), all routers (at least, in neighbourhood of mine)
  85     return only 8 bytes of payload. It is the end.
  86
  87   Hence, if we want that OSPF worked or traceroute said something reasonable,
  88   we should search for another solution.
  89
  90   One of them is to parse packet trying to detect inner encapsulation
  91   made by our node. It is difficult or even impossible, especially,
   92   taking into account fragmentation. To be short, it is not a solution at all.
  93
  94   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  95   We force DF flag on tunnels with preconfigured hop limit,
  96   that is ALL. :-) Well, it does not remove the problem completely,
  97   but exponential growth of network traffic is changed to linear
  98   (branches, that exceed pmtu are pruned) and tunnel mtu
  99   fastly degrades to value <68, where looping stops.
 100   Yes, it is not good if there exists a router in the loop,
 101   which does not force DF, even when encapsulating packets have DF set.
 102   But it is not our problem! Nobody could accuse us, we made
 103   all that we could make. Even if it is your gated who injected
 104   fatal route to network, even if it were you who configured
 105   fatal static route: you are innocent. :-)
 106
 107
 108
 109   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
 110   practically identical code. It would be good to glue them
 111   together, but it is not very evident, how to make them modular.
 112   sit is integral part of IPv6, ipip and gre are naturally modular.
 113   We could extract common parts (hash table, ioctl etc)
 114   to a separate module (ip_tunnel.c).
 115
 116   Alexey Kuznetsov.
 117 */
 118
  /*
   * Forward declarations for symbols that are referenced before their
   * definitions: ipgre_tunnel_locate() below uses ipgre_link_ops,
   * ipgre_tunnel_setup and ipgre_tunnel_bind_dev.
   */
  119static struct rtnl_link_ops ipgre_link_ops __read_mostly;
  120static int ipgre_tunnel_init(struct net_device *dev);
  121static void ipgre_tunnel_setup(struct net_device *dev);
  122static int ipgre_tunnel_bind_dev(struct net_device *dev);
 123
 124/* Fallback tunnel: no source, no destination, no key, no options */
 125
  /* Number of buckets in each of the four tunnel hash tables. */
  126#define HASH_SIZE  16
  127
  /* Id passed to net_generic() to fetch the per-namespace struct ipgre_net. */
  128static int ipgre_net_id;
  /*
   * Per network namespace GRE state.
   *   tunnels:       four hash tables of HASH_SIZE buckets each; the first
   *                  index is the "prio" computed by __ipgre_bucket(), the
   *                  chains are linked through ip_tunnel->next.
   *   fb_tunnel_dev: device whose private data ipgre_tunnel_lookup() returns
   *                  as a last resort when no other tunnel matched and this
   *                  device is up.
   */
  129struct ipgre_net {
  130        struct ip_tunnel *tunnels[4][HASH_SIZE];
  131
  132        struct net_device *fb_tunnel_dev;
  133};
 134
 135/* Tunnel hash table */
 136
 137/*
 138   4 hash tables:
 139
 140   3: (remote,local)
 141   2: (remote,*)
 142   1: (*,local)
 143   0: (*,*)
 144
 145   We require exact key match i.e. if a key is present in packet
 146   it will match only tunnel with the same key; if it is not present,
 147   it will match only keyless tunnel.
 148
  149   All keyless packets, if not matched by configured keyless tunnels,
 150   will match fallback tunnel.
 151 */
 152
 153#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 154
  /*
   * Names for the four hash tables inside struct ipgre_net; the index is the
   * prio value built in __ipgre_bucket() (bit 0: local address set,
   * bit 1: non-multicast remote address set).
   */
  155#define tunnels_r_l     tunnels[3]
  156#define tunnels_r       tunnels[2]
  157#define tunnels_l       tunnels[1]
  158#define tunnels_wc      tunnels[0]
  159
  /*
   * Protects the hash chains: taken for writing (with BHs disabled) by
   * ipgre_tunnel_link()/ipgre_tunnel_unlink() around the chain pointer update,
   * and for reading by ipgre_rcv()/ipgre_err() around ipgre_tunnel_lookup().
   */
  160static DEFINE_RWLOCK(ipgre_lock);
 161
 162/* Given src, dst and key, find appropriate for input tunnel. */
 163
 164static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 165                                              __be32 remote, __be32 local,
 166                                              __be32 key, __be16 gre_proto)
 167{
 168        struct net *net = dev_net(dev);
 169        int link = dev->ifindex;
 170        unsigned h0 = HASH(remote);
 171        unsigned h1 = HASH(key);
 172        struct ip_tunnel *t, *cand = NULL;
 173        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 174        int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
 175                       ARPHRD_ETHER : ARPHRD_IPGRE;
 176        int score, cand_score = 4;
 177
 178        for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
 179                if (local != t->parms.iph.saddr ||
 180                    remote != t->parms.iph.daddr ||
 181                    key != t->parms.i_key ||
 182                    !(t->dev->flags & IFF_UP))
 183                        continue;
 184
 185                if (t->dev->type != ARPHRD_IPGRE &&
 186                    t->dev->type != dev_type)
 187                        continue;
 188
 189                score = 0;
 190                if (t->parms.link != link)
 191                        score |= 1;
 192                if (t->dev->type != dev_type)
 193                        score |= 2;
 194                if (score == 0)
 195                        return t;
 196
 197                if (score < cand_score) {
 198                        cand = t;
 199                        cand_score = score;
 200                }
 201        }
 202
 203        for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
 204                if (remote != t->parms.iph.daddr ||
 205                    key != t->parms.i_key ||
 206                    !(t->dev->flags & IFF_UP))
 207                        continue;
 208
 209                if (t->dev->type != ARPHRD_IPGRE &&
 210                    t->dev->type != dev_type)
 211                        continue;
 212
 213                score = 0;
 214                if (t->parms.link != link)
 215                        score |= 1;
 216                if (t->dev->type != dev_type)
 217                        score |= 2;
 218                if (score == 0)
 219                        return t;
 220
 221                if (score < cand_score) {
 222                        cand = t;
 223                        cand_score = score;
 224                }
 225        }
 226
 227        for (t = ign->tunnels_l[h1]; t; t = t->next) {
 228                if ((local != t->parms.iph.saddr &&
 229                     (local != t->parms.iph.daddr ||
 230                      !ipv4_is_multicast(local))) ||
 231                    key != t->parms.i_key ||
 232                    !(t->dev->flags & IFF_UP))
 233                        continue;
 234
 235                if (t->dev->type != ARPHRD_IPGRE &&
 236                    t->dev->type != dev_type)
 237                        continue;
 238
 239                score = 0;
 240                if (t->parms.link != link)
 241                        score |= 1;
 242                if (t->dev->type != dev_type)
 243                        score |= 2;
 244                if (score == 0)
 245                        return t;
 246
 247                if (score < cand_score) {
 248                        cand = t;
 249                        cand_score = score;
 250                }
 251        }
 252
 253        for (t = ign->tunnels_wc[h1]; t; t = t->next) {
 254                if (t->parms.i_key != key ||
 255                    !(t->dev->flags & IFF_UP))
 256                        continue;
 257
 258                if (t->dev->type != ARPHRD_IPGRE &&
 259                    t->dev->type != dev_type)
 260                        continue;
 261
 262                score = 0;
 263                if (t->parms.link != link)
 264                        score |= 1;
 265                if (t->dev->type != dev_type)
 266                        score |= 2;
 267                if (score == 0)
 268                        return t;
 269
 270                if (score < cand_score) {
 271                        cand = t;
 272                        cand_score = score;
 273                }
 274        }
 275
 276        if (cand != NULL)
 277                return cand;
 278
 279        if (ign->fb_tunnel_dev->flags & IFF_UP)
 280                return netdev_priv(ign->fb_tunnel_dev);
 281
 282        return NULL;
 283}
 284
 285static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
 286                struct ip_tunnel_parm *parms)
 287{
 288        __be32 remote = parms->iph.daddr;
 289        __be32 local = parms->iph.saddr;
 290        __be32 key = parms->i_key;
 291        unsigned h = HASH(key);
 292        int prio = 0;
 293
 294        if (local)
 295                prio |= 1;
 296        if (remote && !ipv4_is_multicast(remote)) {
 297                prio |= 2;
 298                h ^= HASH(remote);
 299        }
 300
 301        return &ign->tunnels[prio][h];
 302}
 303
 304static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
 305                struct ip_tunnel *t)
 306{
 307        return __ipgre_bucket(ign, &t->parms);
 308}
 309
 310static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
 311{
 312        struct ip_tunnel **tp = ipgre_bucket(ign, t);
 313
 314        t->next = *tp;
 315        write_lock_bh(&ipgre_lock);
 316        *tp = t;
 317        write_unlock_bh(&ipgre_lock);
 318}
 319
 320static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 321{
 322        struct ip_tunnel **tp;
 323
 324        for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
 325                if (t == *tp) {
 326                        write_lock_bh(&ipgre_lock);
 327                        *tp = t->next;
 328                        write_unlock_bh(&ipgre_lock);
 329                        break;
 330                }
 331        }
 332}
 333
 334static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 335                                           struct ip_tunnel_parm *parms,
 336                                           int type)
 337{
 338        __be32 remote = parms->iph.daddr;
 339        __be32 local = parms->iph.saddr;
 340        __be32 key = parms->i_key;
 341        int link = parms->link;
 342        struct ip_tunnel *t, **tp;
 343        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 344
 345        for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
 346                if (local == t->parms.iph.saddr &&
 347                    remote == t->parms.iph.daddr &&
 348                    key == t->parms.i_key &&
 349                    link == t->parms.link &&
 350                    type == t->dev->type)
 351                        break;
 352
 353        return t;
 354}
 355
 356static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
 357                struct ip_tunnel_parm *parms, int create)
 358{
 359        struct ip_tunnel *t, *nt;
 360        struct net_device *dev;
 361        char name[IFNAMSIZ];
 362        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 363
 364        t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
 365        if (t || !create)
 366                return t;
 367
 368        if (parms->name[0])
 369                strlcpy(name, parms->name, IFNAMSIZ);
 370        else
 371                sprintf(name, "gre%%d");
 372
 373        dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
 374        if (!dev)
 375          return NULL;
 376
 377        dev_net_set(dev, net);
 378
 379        if (strchr(name, '%')) {
 380                if (dev_alloc_name(dev, name) < 0)
 381                        goto failed_free;
 382        }
 383
 384        nt = netdev_priv(dev);
 385        nt->parms = *parms;
 386        dev->rtnl_link_ops = &ipgre_link_ops;
 387
 388        dev->mtu = ipgre_tunnel_bind_dev(dev);
 389
 390        if (register_netdevice(dev) < 0)
 391                goto failed_free;
 392
 393        dev_hold(dev);
 394        ipgre_tunnel_link(ign, nt);
 395        return nt;
 396
 397failed_free:
 398        free_netdev(dev);
 399        return NULL;
 400}
 401
 402static void ipgre_tunnel_uninit(struct net_device *dev)
 403{
 404        struct net *net = dev_net(dev);
 405        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 406
 407        ipgre_tunnel_unlink(ign, netdev_priv(dev));
 408        dev_put(dev);
 409}
 410
 411
 412static void ipgre_err(struct sk_buff *skb, u32 info)
 413{
 414
 415/* All the routers (except for Linux) return only
 416   8 bytes of packet payload. It means, that precise relaying of
 417   ICMP in the real Internet is absolutely infeasible.
 418
 419   Moreover, Cisco "wise men" put GRE key to the third word
 420   in GRE header. It makes impossible maintaining even soft state for keyed
 421   GRE tunnels with enabled checksum. Tell them "thank you".
 422
 423   Well, I wonder, rfc1812 was written by Cisco employee,
 424   what the hell these idiots break standrads established
 425   by themself???
 426 */
 427
 428        struct iphdr *iph = (struct iphdr *)skb->data;
 429        __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
 430        int grehlen = (iph->ihl<<2) + 4;
 431        const int type = icmp_hdr(skb)->type;
 432        const int code = icmp_hdr(skb)->code;
 433        struct ip_tunnel *t;
 434        __be16 flags;
 435
 436        flags = p[0];
 437        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
 438                if (flags&(GRE_VERSION|GRE_ROUTING))
 439                        return;
 440                if (flags&GRE_KEY) {
 441                        grehlen += 4;
 442                        if (flags&GRE_CSUM)
 443                                grehlen += 4;
 444                }
 445        }
 446
 447        /* If only 8 bytes returned, keyed message will be dropped here */
 448        if (skb_headlen(skb) < grehlen)
 449                return;
 450
 451        switch (type) {
 452        default:
 453        case ICMP_PARAMETERPROB:
 454                return;
 455
 456        case ICMP_DEST_UNREACH:
 457                switch (code) {
 458                case ICMP_SR_FAILED:
 459                case ICMP_PORT_UNREACH:
 460                        /* Impossible event. */
 461                        return;
 462                case ICMP_FRAG_NEEDED:
 463                        /* Soft state for pmtu is maintained by IP core. */
 464                        return;
 465                default:
 466                        /* All others are translated to HOST_UNREACH.
 467                           rfc2003 contains "deep thoughts" about NET_UNREACH,
 468                           I believe they are just ether pollution. --ANK
 469                         */
 470                        break;
 471                }
 472                break;
 473        case ICMP_TIME_EXCEEDED:
 474                if (code != ICMP_EXC_TTL)
 475                        return;
 476                break;
 477        }
 478
 479        read_lock(&ipgre_lock);
 480        t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
 481                                flags & GRE_KEY ?
 482                                *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
 483                                p[1]);
 484        if (t == NULL || t->parms.iph.daddr == 0 ||
 485            ipv4_is_multicast(t->parms.iph.daddr))
 486                goto out;
 487
 488        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 489                goto out;
 490
 491        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 492                t->err_count++;
 493        else
 494                t->err_count = 1;
 495        t->err_time = jiffies;
 496out:
 497        read_unlock(&ipgre_lock);
 498        return;
 499}
 500
 501static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
 502{
 503        if (INET_ECN_is_ce(iph->tos)) {
 504                if (skb->protocol == htons(ETH_P_IP)) {
 505                        IP_ECN_set_ce(ip_hdr(skb));
 506                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 507                        IP6_ECN_set_ce(ipv6_hdr(skb));
 508                }
 509        }
 510}
 511
 512static inline u8
 513ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
 514{
 515        u8 inner = 0;
 516        if (skb->protocol == htons(ETH_P_IP))
 517                inner = old_iph->tos;
 518        else if (skb->protocol == htons(ETH_P_IPV6))
 519                inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
 520        return INET_ECN_encapsulate(tos, inner);
 521}
 522
  /*
   * Receive handler for GRE packets.
   *
   * Parses the GRE header found at skb->data, looks up the matching tunnel
   * under ipgre_lock (read side) and, on success, strips the GRE header,
   * updates the tunnel device statistics and hands the inner packet to
   * netif_rx() on the tunnel device.
   *
   * Packets with a non-zero version or the routing flag are dropped.  When no
   * tunnel matches, an ICMP port-unreachable is sent before the skb is freed.
   * Always returns 0: the skb is either passed to netif_rx() or freed here.
   */
  523static int ipgre_rcv(struct sk_buff *skb)
  524{
  525        struct iphdr *iph;
  526        u8     *h;
  527        __be16    flags;
  528        __sum16   csum = 0;
  529        __be32 key = 0;
  530        u32    seqno = 0;
  531        struct ip_tunnel *tunnel;
  532        int    offset = 4;
  533        __be16 gre_proto;
  534        unsigned int len;
  535
             /*
              * 16 bytes cover the 4-byte base header plus the optional
              * checksum, key and sequence words that are read below.
              */
  536        if (!pskb_may_pull(skb, 16))
  537                goto drop_nolock;
  538
  539        iph = ip_hdr(skb);
  540        h = skb->data;
  541        flags = *(__be16*)h;
  542
  543        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
  544                /* - Version must be 0.
  545                   - We do not support routing headers.
  546                 */
  547                if (flags&(GRE_VERSION|GRE_ROUTING))
  548                        goto drop_nolock;
  549
  550                if (flags&GRE_CSUM) {
                             /*
                              * A non-zero csum after this switch marks a bad
                              * checksum; it is acted upon after the lookup.
                              */
  551                        switch (skb->ip_summed) {
  552                        case CHECKSUM_COMPLETE:
  553                                csum = csum_fold(skb->csum);
  554                                if (!csum)
  555                                        break;
  556                                /* fall through */
  557                        case CHECKSUM_NONE:
  558                                skb->csum = 0;
  559                                csum = __skb_checksum_complete(skb);
  560                                skb->ip_summed = CHECKSUM_COMPLETE;
  561                        }
  562                        offset += 4;
  563                }
  564                if (flags&GRE_KEY) {
  565                        key = *(__be32*)(h + offset);
  566                        offset += 4;
  567                }
  568                if (flags&GRE_SEQ) {
  569                        seqno = ntohl(*(__be32*)(h + offset));
  570                        offset += 4;
  571                }
  572        }
  573
             /* The protocol type is the second 16-bit word of the header. */
  574        gre_proto = *(__be16 *)(h + 2);
  575
  576        read_lock(&ipgre_lock);
  577        if ((tunnel = ipgre_tunnel_lookup(skb->dev,
  578                                          iph->saddr, iph->daddr, key,
  579                                          gre_proto))) {
  580                struct net_device_stats *stats = &tunnel->dev->stats;
  581
  582                secpath_reset(skb);
  583
  584                skb->protocol = gre_proto;
  585                /* WCCP version 1 and 2 protocol decoding.
  586                 * - Change protocol to IP
  587                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
  588                 */
  589                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
  590                        skb->protocol = htons(ETH_P_IP);
                             /* No IPv4 version nibble here: skip 4 more bytes. */
  591                        if ((*(h + offset) & 0xF0) != 0x40)
  592                                offset += 4;
  593                }
  594
  595                skb->mac_header = skb->network_header;
                     /* Strip the GRE header (offset bytes) from the packet. */
  596                __pskb_pull(skb, offset);
  597                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
  598                skb->pkt_type = PACKET_HOST;
  599#ifdef CONFIG_NET_IPGRE_BROADCAST
  600                if (ipv4_is_multicast(iph->daddr)) {
  601                        /* Looped back packet, drop it! */
  602                        if (skb_rtable(skb)->fl.iif == 0)
  603                                goto drop;
  604                        stats->multicast++;
  605                        skb->pkt_type = PACKET_BROADCAST;
  606                }
  607#endif
  608
                     /*
                      * Drop on a bad checksum, or when the tunnel requires a
                      * checksum and the packet carries none.
                      */
  609                if (((flags&GRE_CSUM) && csum) ||
  610                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
  611                        stats->rx_crc_errors++;
  612                        stats->rx_errors++;
  613                        goto drop;
  614                }
                     /*
                      * Sequenced tunnel: drop packets without a sequence number
                      * and, once i_seqno is non-zero, packets that are older
                      * than the next expected number (signed difference, so
                      * wrap-around is handled).
                      */
  615                if (tunnel->parms.i_flags&GRE_SEQ) {
  616                        if (!(flags&GRE_SEQ) ||
  617                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
  618                                stats->rx_fifo_errors++;
  619                                stats->rx_errors++;
  620                                goto drop;
  621                        }
  622                        tunnel->i_seqno = seqno + 1;
  623                }
  624
                     /*
                      * Length used for rx_bytes; sampled before the Ethernet
                      * branch below processes the inner frame header.
                      */
  625                len = skb->len;
  626
  627                /* Warning: All skb pointers will be invalidated! */
  628                if (tunnel->dev->type == ARPHRD_ETHER) {
  629                        if (!pskb_may_pull(skb, ETH_HLEN)) {
  630                                stats->rx_length_errors++;
  631                                stats->rx_errors++;
  632                                goto drop;
  633                        }
  634
                             /* Reload: see the warning above. */
  635                        iph = ip_hdr(skb);
  636                        skb->protocol = eth_type_trans(skb, tunnel->dev);
  637                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
  638                }
  639
  640                stats->rx_packets++;
  641                stats->rx_bytes += len;
  642                skb->dev = tunnel->dev;
  643                skb_dst_drop(skb);
  644                nf_reset(skb);
  645
  646                skb_reset_network_header(skb);
                     /* iph still refers to the outer IP header here. */
  647                ipgre_ecn_decapsulate(iph, skb);
  648
  649                netif_rx(skb);
  650                read_unlock(&ipgre_lock);
  651                return(0);
  652        }
             /* No tunnel (not even the fallback) accepted the packet. */
  653        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
  654
  655drop:
  656        read_unlock(&ipgre_lock);
  657drop_nolock:
  658        kfree_skb(skb);
  659        return(0);
  660}
 661
  /*
   * Transmit handler of a GRE tunnel device.
   *
   * Resolves the outer destination, routes the outer packet, applies the
   * path-MTU / DF policy, makes sure there is enough headroom, then builds
   * the outer IPv4 header plus the GRE header in front of the payload and
   * sends the result through IPTUNNEL_XMIT().
   *
   * Returns NETDEV_TX_OK on every path.  On failure the skb is freed here
   * and tx_errors (or tx_dropped when reallocating headroom fails) is
   * incremented on the tunnel device.
   *
   * NOTE(review): IPTUNNEL_XMIT() is a macro defined outside this file and
   * presumably refers to local variables of this function by name - confirm
   * before renaming any of them.
   */
  662static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
  663{
  664        struct ip_tunnel *tunnel = netdev_priv(dev);
  665        struct net_device_stats *stats = &tunnel->dev->stats;
  666        struct iphdr  *old_iph = ip_hdr(skb);
  667        struct iphdr  *tiph;
  668        u8     tos;
  669        __be16 df;
  670        struct rtable *rt;                      /* Route to the other host */
  671        struct net_device *tdev;                        /* Device to other host */
  672        struct iphdr  *iph;                     /* Our new IP header */
  673        unsigned int max_headroom;              /* The extra header space needed */
  674        int    gre_hlen;
  675        __be32 dst;
  676        int    mtu;
  677
  678        if (dev->type == ARPHRD_ETHER)
  679                IPCB(skb)->flags = 0;
  680
             /*
              * With header_ops on an ARPHRD_IPGRE device the header template
              * is read from the front of the skb and nothing is pushed later
              * (gre_hlen = 0); otherwise the tunnel's configured template and
              * header length are used.
              */
  681        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
  682                gre_hlen = 0;
  683                tiph = (struct iphdr *)skb->data;
  684        } else {
  685                gre_hlen = tunnel->hlen;
  686                tiph = &tunnel->parms.iph;
  687        }
  688
  689        if ((dst = tiph->daddr) == 0) {
  690                /* NBMA tunnel */
  691
  692                if (skb_dst(skb) == NULL) {
  693                        stats->tx_fifo_errors++;
  694                        goto tx_error;
  695                }
  696
                     /* IPv4 payload: the outer destination is the route's gateway. */
  697                if (skb->protocol == htons(ETH_P_IP)) {
  698                        rt = skb_rtable(skb);
  699                        if ((dst = rt->rt_gateway) == 0)
  700                                goto tx_error_icmp;
  701                }
  702#ifdef CONFIG_IPV6
                     /*
                      * IPv6 payload: take the IPv4 address embedded in the
                      * neighbour's (or, failing that, the packet's destination)
                      * IPv4-compatible IPv6 address.
                      */
  703                else if (skb->protocol == htons(ETH_P_IPV6)) {
  704                        struct in6_addr *addr6;
  705                        int addr_type;
  706                        struct neighbour *neigh = skb_dst(skb)->neighbour;
  707
  708                        if (neigh == NULL)
  709                                goto tx_error;
  710
  711                        addr6 = (struct in6_addr *)&neigh->primary_key;
  712                        addr_type = ipv6_addr_type(addr6);
  713
  714                        if (addr_type == IPV6_ADDR_ANY) {
  715                                addr6 = &ipv6_hdr(skb)->daddr;
  716                                addr_type = ipv6_addr_type(addr6);
  717                        }
  718
  719                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
  720                                goto tx_error_icmp;
  721
  722                        dst = addr6->s6_addr32[3];
  723                }
  724#endif
  725                else
  726                        goto tx_error;
  727        }
  728
             /*
              * A template TOS of 1 means "inherit": use the inner IPv4 TOS,
              * or 0 when the payload is not IPv4.
              */
  729        tos = tiph->tos;
  730        if (tos == 1) {
  731                tos = 0;
  732                if (skb->protocol == htons(ETH_P_IP))
  733                        tos = old_iph->tos;
  734        }
  735
             /* Route the outer packet. */
  736        {
  737                struct flowi fl = { .oif = tunnel->parms.link,
  738                                    .nl_u = { .ip4_u =
  739                                              { .daddr = dst,
  740                                                .saddr = tiph->saddr,
  741                                                .tos = RT_TOS(tos) } },
  742                                    .proto = IPPROTO_GRE };
  743                if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
  744                        stats->tx_carrier_errors++;
  745                        goto tx_error;
  746                }
  747        }
  748        tdev = rt->u.dst.dev;
  749
             /* The route leads back into this very tunnel device: drop. */
  750        if (tdev == dev) {
  751                ip_rt_put(rt);
  752                stats->collisions++;
  753                goto tx_error;
  754        }
  755
             /*
              * With DF configured the usable mtu is the outer path mtu minus
              * our own headers; otherwise the inner route's mtu (or the
              * device mtu when the skb has no dst) is used.
              */
  756        df = tiph->frag_off;
  757        if (df)
  758                mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
  759        else
  760                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
  761
  762        if (skb_dst(skb))
  763                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
  764
  765        if (skb->protocol == htons(ETH_P_IP)) {
                     /* Copy the inner DF bit to the outer header. */
  766                df |= (old_iph->frag_off&htons(IP_DF));
  767
                     /* Inner packet has DF set and does not fit: tell the sender. */
  768                if ((old_iph->frag_off&htons(IP_DF)) &&
  769                    mtu < ntohs(old_iph->tot_len)) {
  770                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
  771                        ip_rt_put(rt);
  772                        goto tx_error;
  773                }
  774        }
  775#ifdef CONFIG_IPV6
  776        else if (skb->protocol == htons(ETH_P_IPV6)) {
  777                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
  778
  779                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
  780                        if ((tunnel->parms.iph.daddr &&
  781                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
  782                            rt6->rt6i_dst.plen == 128) {
  783                                rt6->rt6i_flags |= RTF_MODIFIED;
  784                                skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
  785                        }
  786                }
  787
  788                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
  789                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
  790                        ip_rt_put(rt);
  791                        goto tx_error;
  792                }
  793        }
  794#endif
  795
             /*
              * Errors recorded in err_count/err_time (see ipgre_err()) are
              * reported to the sender one per transmitted packet while they
              * are recent; stale ones are discarded.
              */
  796        if (tunnel->err_count > 0) {
  797                if (time_before(jiffies,
  798                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
  799                        tunnel->err_count--;
  800
  801                        dst_link_failure(skb);
  802                } else
  803                        tunnel->err_count = 0;
  804        }
  805
  806        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
  807
             /*
              * Get a private skb with enough headroom when the current one is
              * too short, shared, or cloned and not writable.
              */
  808        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
  809            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
  810                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
  811                if (!new_skb) {
  812                        ip_rt_put(rt);
  813                        stats->tx_dropped++;
  814                        dev_kfree_skb(skb);
  815                        return NETDEV_TX_OK;
  816                }
  817                if (skb->sk)
  818                        skb_set_owner_w(new_skb, skb->sk);
  819                dev_kfree_skb(skb);
  820                skb = new_skb;
                     /* The data moved: refresh the inner header pointer. */
  821                old_iph = ip_hdr(skb);
  822        }
  823
             /*
              * The old data start becomes the transport header; the pushed
              * gre_hlen bytes in front of it become the new network header.
              */
  824        skb_reset_transport_header(skb);
  825        skb_push(skb, gre_hlen);
  826        skb_reset_network_header(skb);
  827        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
  828        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
  829                              IPSKB_REROUTED);
             /* Replace the inner route with the outer route looked up above. */
  830        skb_dst_drop(skb);
  831        skb_dst_set(skb, &rt->u.dst);
  832
  833        /*
  834         *      Push down and install the IPIP header.
  835         */
  836
  837        iph                     =       ip_hdr(skb);
  838        iph->version            =       4;
  839        iph->ihl                =       sizeof(struct iphdr) >> 2;
  840        iph->frag_off           =       df;
  841        iph->protocol           =       IPPROTO_GRE;
  842        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
  843        iph->daddr              =       rt->rt_dst;
  844        iph->saddr              =       rt->rt_src;
  845
             /*
              * A template TTL of 0 means "inherit" from the inner IPv4/IPv6
              * header, or use the route's hop limit metric for other payloads.
              */
  846        if ((iph->ttl = tiph->ttl) == 0) {
  847                if (skb->protocol == htons(ETH_P_IP))
  848                        iph->ttl = old_iph->ttl;
  849#ifdef CONFIG_IPV6
  850                else if (skb->protocol == htons(ETH_P_IPV6))
  851                        iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
  852#endif
  853                else
  854                        iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
  855        }
  856
             /* GRE base header: flags word, then protocol type. */
  857        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
  858        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
  859                                   htons(ETH_P_TEB) : skb->protocol;
  860
             /*
              * Optional words are filled in back to front: ptr starts at the
              * last 32-bit word of the header (sequence number), then moves
              * down to the key and finally the checksum word.
              */
  861        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
  862                __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
  863
  864                if (tunnel->parms.o_flags&GRE_SEQ) {
  865                        ++tunnel->o_seqno;
  866                        *ptr = htonl(tunnel->o_seqno);
  867                        ptr--;
  868                }
  869                if (tunnel->parms.o_flags&GRE_KEY) {
  870                        *ptr = tunnel->parms.o_key;
  871                        ptr--;
  872                }
  873                if (tunnel->parms.o_flags&GRE_CSUM) {
                             /* Zero the word first; the sum covers GRE header + payload. */
  874                        *ptr = 0;
  875                        *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
  876                }
  877        }
  878
  879        nf_reset(skb);
  880
  881        IPTUNNEL_XMIT();
  882        return NETDEV_TX_OK;
  883
  884tx_error_icmp:
  885        dst_link_failure(skb);
  886
  887tx_error:
  888        stats->tx_errors++;
  889        dev_kfree_skb(skb);
  890        return NETDEV_TX_OK;
  891}
 892
 893static int ipgre_tunnel_bind_dev(struct net_device *dev)
 894{
 895        struct net_device *tdev = NULL;
 896        struct ip_tunnel *tunnel;
 897        struct iphdr *iph;
 898        int hlen = LL_MAX_HEADER;
 899        int mtu = ETH_DATA_LEN;
 900        int addend = sizeof(struct iphdr) + 4;
 901
 902        tunnel = netdev_priv(dev);
 903        iph = &tunnel->parms.iph;
 904
 905        /* Guess output device to choose reasonable mtu and needed_headroom */
 906
 907        if (iph->daddr) {
 908                struct flowi fl = { .oif = tunnel->parms.link,
 909                                    .nl_u = { .ip4_u =
 910                                              { .daddr = iph->daddr,
 911                                                .saddr = iph->saddr,
 912                                                .tos = RT_TOS(iph->tos) } },
 913                                    .proto = IPPROTO_GRE };
 914                struct rtable *rt;
 915                if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
 916                        tdev = rt->u.dst.dev;
 917                        ip_rt_put(rt);
 918                }
 919
 920                if (dev->type != ARPHRD_ETHER)
 921                        dev->flags |= IFF_POINTOPOINT;
 922        }
 923
 924        if (!tdev && tunnel->parms.link)
 925                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
 926
 927        if (tdev) {
 928                hlen = tdev->hard_header_len + tdev->needed_headroom;
 929                mtu = tdev->mtu;
 930        }
 931        dev->iflink = tunnel->parms.link;
 932
 933        /* Precalculate GRE options length */
 934        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
 935                if (tunnel->parms.o_flags&GRE_CSUM)
 936                        addend += 4;
 937                if (tunnel->parms.o_flags&GRE_KEY)
 938                        addend += 4;
 939                if (tunnel->parms.o_flags&GRE_SEQ)
 940                        addend += 4;
 941        }
 942        dev->needed_headroom = addend + hlen;
 943        mtu -= dev->hard_header_len + addend;
 944
 945        if (mtu < 68)
 946                mtu = 68;
 947
 948        tunnel->hlen = addend;
 949
 950        return mtu;
 951}
 952
 953static int
 954ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 955{
 956        int err = 0;
 957        struct ip_tunnel_parm p;
 958        struct ip_tunnel *t;
 959        struct net *net = dev_net(dev);
 960        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 961
 962        switch (cmd) {
 963        case SIOCGETTUNNEL:
 964                t = NULL;
 965                if (dev == ign->fb_tunnel_dev) {
 966                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
 967                                err = -EFAULT;
 968                                break;
 969                        }
 970                        t = ipgre_tunnel_locate(net, &p, 0);
 971                }
 972                if (t == NULL)
 973                        t = netdev_priv(dev);
 974                memcpy(&p, &t->parms, sizeof(p));
 975                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 976                        err = -EFAULT;
 977                break;
 978
 979        case SIOCADDTUNNEL:
 980        case SIOCCHGTUNNEL:
 981                err = -EPERM;
 982                if (!capable(CAP_NET_ADMIN))
 983                        goto done;
 984
 985                err = -EFAULT;
 986                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 987                        goto done;
 988
 989                err = -EINVAL;
 990                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
 991                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
 992                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
 993                        goto done;
 994                if (p.iph.ttl)
 995                        p.iph.frag_off |= htons(IP_DF);
 996
 997                if (!(p.i_flags&GRE_KEY))
 998                        p.i_key = 0;
 999                if (!(p.o_flags&GRE_KEY))
1000                        p.o_key = 0;
1001
1002                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1003
1004                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1005                        if (t != NULL) {
1006                                if (t->dev != dev) {
1007                                        err = -EEXIST;
1008                                        break;
1009                                }
1010                        } else {
1011                                unsigned nflags = 0;
1012
1013                                t = netdev_priv(dev);
1014
1015                                if (ipv4_is_multicast(p.iph.daddr))
1016                                        nflags = IFF_BROADCAST;
1017                                else if (p.iph.daddr)
1018                                        nflags = IFF_POINTOPOINT;
1019
1020                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1021                                        err = -EINVAL;
1022                                        break;
1023                                }
1024                                ipgre_tunnel_unlink(ign, t);
1025                                t->parms.iph.saddr = p.iph.saddr;
1026                                t->parms.iph.daddr = p.iph.daddr;
1027                                t->parms.i_key = p.i_key;
1028                                t->parms.o_key = p.o_key;
1029                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
1030                                memcpy(dev->broadcast, &p.iph.daddr, 4);
1031                                ipgre_tunnel_link(ign, t);
1032                                netdev_state_change(dev);
1033                        }
1034                }
1035
1036                if (t) {
1037                        err = 0;
1038                        if (cmd == SIOCCHGTUNNEL) {
1039                                t->parms.iph.ttl = p.iph.ttl;
1040                                t->parms.iph.tos = p.iph.tos;
1041                                t->parms.iph.frag_off = p.iph.frag_off;
1042                                if (t->parms.link != p.link) {
1043                                        t->parms.link = p.link;
1044                                        dev->mtu = ipgre_tunnel_bind_dev(dev);
1045                                        netdev_state_change(dev);
1046                                }
1047                        }
1048                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1049                                err = -EFAULT;
1050                } else
1051                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1052                break;
1053
1054        case SIOCDELTUNNEL:
1055                err = -EPERM;
1056                if (!capable(CAP_NET_ADMIN))
1057                        goto done;
1058
1059                if (dev == ign->fb_tunnel_dev) {
1060                        err = -EFAULT;
1061                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1062                                goto done;
1063                        err = -ENOENT;
1064                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1065                                goto done;
1066                        err = -EPERM;
1067                        if (t == netdev_priv(ign->fb_tunnel_dev))
1068                                goto done;
1069                        dev = t->dev;
1070                }
1071                unregister_netdevice(dev);
1072                err = 0;
1073                break;
1074
1075        default:
1076                err = -EINVAL;
1077        }
1078
1079done:
1080        return err;
1081}
1082
1083static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1084{
1085        struct ip_tunnel *tunnel = netdev_priv(dev);
1086        if (new_mtu < 68 ||
1087            new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1088                return -EINVAL;
1089        dev->mtu = new_mtu;
1090        return 0;
1091}
1092
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */
1121
1122static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1123                        unsigned short type,
1124                        const void *daddr, const void *saddr, unsigned len)
1125{
1126        struct ip_tunnel *t = netdev_priv(dev);
1127        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1128        __be16 *p = (__be16*)(iph+1);
1129
1130        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1131        p[0]            = t->parms.o_flags;
1132        p[1]            = htons(type);
1133
1134        /*
1135         *      Set the source hardware address.
1136         */
1137
1138        if (saddr)
1139                memcpy(&iph->saddr, saddr, 4);
1140
1141        if (daddr) {
1142                memcpy(&iph->daddr, daddr, 4);
1143                return t->hlen;
1144        }
1145        if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1146                return t->hlen;
1147
1148        return -t->hlen;
1149}
1150
1151static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1152{
1153        struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1154        memcpy(haddr, &iph->saddr, 4);
1155        return 4;
1156}
1157
1158static const struct header_ops ipgre_header_ops = {
1159        .create = ipgre_header,
1160        .parse  = ipgre_header_parse,
1161};
1162
1163#ifdef CONFIG_NET_IPGRE_BROADCAST
1164static int ipgre_open(struct net_device *dev)
1165{
1166        struct ip_tunnel *t = netdev_priv(dev);
1167
1168        if (ipv4_is_multicast(t->parms.iph.daddr)) {
1169                struct flowi fl = { .oif = t->parms.link,
1170                                    .nl_u = { .ip4_u =
1171                                              { .daddr = t->parms.iph.daddr,
1172                                                .saddr = t->parms.iph.saddr,
1173                                                .tos = RT_TOS(t->parms.iph.tos) } },
1174                                    .proto = IPPROTO_GRE };
1175                struct rtable *rt;
1176                if (ip_route_output_key(dev_net(dev), &rt, &fl))
1177                        return -EADDRNOTAVAIL;
1178                dev = rt->u.dst.dev;
1179                ip_rt_put(rt);
1180                if (__in_dev_get_rtnl(dev) == NULL)
1181                        return -EADDRNOTAVAIL;
1182                t->mlink = dev->ifindex;
1183                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1184        }
1185        return 0;
1186}
1187
1188static int ipgre_close(struct net_device *dev)
1189{
1190        struct ip_tunnel *t = netdev_priv(dev);
1191
1192        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1193                struct in_device *in_dev;
1194                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1195                if (in_dev) {
1196                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1197                        in_dev_put(in_dev);
1198                }
1199        }
1200        return 0;
1201}
1202
1203#endif
1204
1205static const struct net_device_ops ipgre_netdev_ops = {
1206        .ndo_init               = ipgre_tunnel_init,
1207        .ndo_uninit             = ipgre_tunnel_uninit,
1208#ifdef CONFIG_NET_IPGRE_BROADCAST
1209        .ndo_open               = ipgre_open,
1210        .ndo_stop               = ipgre_close,
1211#endif
1212        .ndo_start_xmit         = ipgre_tunnel_xmit,
1213        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1214        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1215};
1216
1217static void ipgre_tunnel_setup(struct net_device *dev)
1218{
1219        dev->netdev_ops         = &ipgre_netdev_ops;
1220        dev->destructor         = free_netdev;
1221
1222        dev->type               = ARPHRD_IPGRE;
1223        dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1224        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1225        dev->flags              = IFF_NOARP;
1226        dev->iflink             = 0;
1227        dev->addr_len           = 4;
1228        dev->features           |= NETIF_F_NETNS_LOCAL;
1229        dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1230}
1231
1232static int ipgre_tunnel_init(struct net_device *dev)
1233{
1234        struct ip_tunnel *tunnel;
1235        struct iphdr *iph;
1236
1237        tunnel = netdev_priv(dev);
1238        iph = &tunnel->parms.iph;
1239
1240        tunnel->dev = dev;
1241        strcpy(tunnel->parms.name, dev->name);
1242
1243        memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1244        memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1245
1246        if (iph->daddr) {
1247#ifdef CONFIG_NET_IPGRE_BROADCAST
1248                if (ipv4_is_multicast(iph->daddr)) {
1249                        if (!iph->saddr)
1250                                return -EINVAL;
1251                        dev->flags = IFF_BROADCAST;
1252                        dev->header_ops = &ipgre_header_ops;
1253                }
1254#endif
1255        } else
1256                dev->header_ops = &ipgre_header_ops;
1257
1258        return 0;
1259}
1260
1261static void ipgre_fb_tunnel_init(struct net_device *dev)
1262{
1263        struct ip_tunnel *tunnel = netdev_priv(dev);
1264        struct iphdr *iph = &tunnel->parms.iph;
1265        struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1266
1267        tunnel->dev = dev;
1268        strcpy(tunnel->parms.name, dev->name);
1269
1270        iph->version            = 4;
1271        iph->protocol           = IPPROTO_GRE;
1272        iph->ihl                = 5;
1273        tunnel->hlen            = sizeof(struct iphdr) + 4;
1274
1275        dev_hold(dev);
1276        ign->tunnels_wc[0]      = tunnel;
1277}
1278
1279
1280static const struct net_protocol ipgre_protocol = {
1281        .handler        =       ipgre_rcv,
1282        .err_handler    =       ipgre_err,
1283        .netns_ok       =       1,
1284};
1285
1286static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1287{
1288        int prio;
1289
1290        for (prio = 0; prio < 4; prio++) {
1291                int h;
1292                for (h = 0; h < HASH_SIZE; h++) {
1293                        struct ip_tunnel *t;
1294                        while ((t = ign->tunnels[prio][h]) != NULL)
1295                                unregister_netdevice(t->dev);
1296                }
1297        }
1298}
1299
1300static int ipgre_init_net(struct net *net)
1301{
1302        int err;
1303        struct ipgre_net *ign;
1304
1305        err = -ENOMEM;
1306        ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1307        if (ign == NULL)
1308                goto err_alloc;
1309
1310        err = net_assign_generic(net, ipgre_net_id, ign);
1311        if (err < 0)
1312                goto err_assign;
1313
1314        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1315                                           ipgre_tunnel_setup);
1316        if (!ign->fb_tunnel_dev) {
1317                err = -ENOMEM;
1318                goto err_alloc_dev;
1319        }
1320        dev_net_set(ign->fb_tunnel_dev, net);
1321
1322        ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1323        ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1324
1325        if ((err = register_netdev(ign->fb_tunnel_dev)))
1326                goto err_reg_dev;
1327
1328        return 0;
1329
1330err_reg_dev:
1331        free_netdev(ign->fb_tunnel_dev);
1332err_alloc_dev:
1333        /* nothing */
1334err_assign:
1335        kfree(ign);
1336err_alloc:
1337        return err;
1338}
1339
1340static void ipgre_exit_net(struct net *net)
1341{
1342        struct ipgre_net *ign;
1343
1344        ign = net_generic(net, ipgre_net_id);
1345        rtnl_lock();
1346        ipgre_destroy_tunnels(ign);
1347        rtnl_unlock();
1348        kfree(ign);
1349}
1350
1351static struct pernet_operations ipgre_net_ops = {
1352        .init = ipgre_init_net,
1353        .exit = ipgre_exit_net,
1354};
1355
1356static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1357{
1358        __be16 flags;
1359
1360        if (!data)
1361                return 0;
1362
1363        flags = 0;
1364        if (data[IFLA_GRE_IFLAGS])
1365                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1366        if (data[IFLA_GRE_OFLAGS])
1367                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1368        if (flags & (GRE_VERSION|GRE_ROUTING))
1369                return -EINVAL;
1370
1371        return 0;
1372}
1373
1374static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1375{
1376        __be32 daddr;
1377
1378        if (tb[IFLA_ADDRESS]) {
1379                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1380                        return -EINVAL;
1381                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1382                        return -EADDRNOTAVAIL;
1383        }
1384
1385        if (!data)
1386                goto out;
1387
1388        if (data[IFLA_GRE_REMOTE]) {
1389                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1390                if (!daddr)
1391                        return -EINVAL;
1392        }
1393
1394out:
1395        return ipgre_tunnel_validate(tb, data);
1396}
1397
1398static void ipgre_netlink_parms(struct nlattr *data[],
1399                                struct ip_tunnel_parm *parms)
1400{
1401        memset(parms, 0, sizeof(*parms));
1402
1403        parms->iph.protocol = IPPROTO_GRE;
1404
1405        if (!data)
1406                return;
1407
1408        if (data[IFLA_GRE_LINK])
1409                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1410
1411        if (data[IFLA_GRE_IFLAGS])
1412                parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1413
1414        if (data[IFLA_GRE_OFLAGS])
1415                parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1416
1417        if (data[IFLA_GRE_IKEY])
1418                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1419
1420        if (data[IFLA_GRE_OKEY])
1421                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1422
1423        if (data[IFLA_GRE_LOCAL])
1424                parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1425
1426        if (data[IFLA_GRE_REMOTE])
1427                parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1428
1429        if (data[IFLA_GRE_TTL])
1430                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1431
1432        if (data[IFLA_GRE_TOS])
1433                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1434
1435        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1436                parms->iph.frag_off = htons(IP_DF);
1437}
1438
1439static int ipgre_tap_init(struct net_device *dev)
1440{
1441        struct ip_tunnel *tunnel;
1442
1443        tunnel = netdev_priv(dev);
1444
1445        tunnel->dev = dev;
1446        strcpy(tunnel->parms.name, dev->name);
1447
1448        ipgre_tunnel_bind_dev(dev);
1449
1450        return 0;
1451}
1452
1453static const struct net_device_ops ipgre_tap_netdev_ops = {
1454        .ndo_init               = ipgre_tap_init,
1455        .ndo_uninit             = ipgre_tunnel_uninit,
1456        .ndo_start_xmit         = ipgre_tunnel_xmit,
1457        .ndo_set_mac_address    = eth_mac_addr,
1458        .ndo_validate_addr      = eth_validate_addr,
1459        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1460};
1461
1462static void ipgre_tap_setup(struct net_device *dev)
1463{
1464
1465        ether_setup(dev);
1466
1467        dev->netdev_ops         = &ipgre_tap_netdev_ops;
1468        dev->destructor         = free_netdev;
1469
1470        dev->iflink             = 0;
1471        dev->features           |= NETIF_F_NETNS_LOCAL;
1472}
1473
1474static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1475                         struct nlattr *data[])
1476{
1477        struct ip_tunnel *nt;
1478        struct net *net = dev_net(dev);
1479        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1480        int mtu;
1481        int err;
1482
1483        nt = netdev_priv(dev);
1484        ipgre_netlink_parms(data, &nt->parms);
1485
1486        if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1487                return -EEXIST;
1488
1489        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1490                random_ether_addr(dev->dev_addr);
1491
1492        mtu = ipgre_tunnel_bind_dev(dev);
1493        if (!tb[IFLA_MTU])
1494                dev->mtu = mtu;
1495
1496        err = register_netdevice(dev);
1497        if (err)
1498                goto out;
1499
1500        dev_hold(dev);
1501        ipgre_tunnel_link(ign, nt);
1502
1503out:
1504        return err;
1505}
1506
1507static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1508                            struct nlattr *data[])
1509{
1510        struct ip_tunnel *t, *nt;
1511        struct net *net = dev_net(dev);
1512        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1513        struct ip_tunnel_parm p;
1514        int mtu;
1515
1516        if (dev == ign->fb_tunnel_dev)
1517                return -EINVAL;
1518
1519        nt = netdev_priv(dev);
1520        ipgre_netlink_parms(data, &p);
1521
1522        t = ipgre_tunnel_locate(net, &p, 0);
1523
1524        if (t) {
1525                if (t->dev != dev)
1526                        return -EEXIST;
1527        } else {
1528                t = nt;
1529
1530                if (dev->type != ARPHRD_ETHER) {
1531                        unsigned nflags = 0;
1532
1533                        if (ipv4_is_multicast(p.iph.daddr))
1534                                nflags = IFF_BROADCAST;
1535                        else if (p.iph.daddr)
1536                                nflags = IFF_POINTOPOINT;
1537
1538                        if ((dev->flags ^ nflags) &
1539                            (IFF_POINTOPOINT | IFF_BROADCAST))
1540                                return -EINVAL;
1541                }
1542
1543                ipgre_tunnel_unlink(ign, t);
1544                t->parms.iph.saddr = p.iph.saddr;
1545                t->parms.iph.daddr = p.iph.daddr;
1546                t->parms.i_key = p.i_key;
1547                if (dev->type != ARPHRD_ETHER) {
1548                        memcpy(dev->dev_addr, &p.iph.saddr, 4);
1549                        memcpy(dev->broadcast, &p.iph.daddr, 4);
1550                }
1551                ipgre_tunnel_link(ign, t);
1552                netdev_state_change(dev);
1553        }
1554
1555        t->parms.o_key = p.o_key;
1556        t->parms.iph.ttl = p.iph.ttl;
1557        t->parms.iph.tos = p.iph.tos;
1558        t->parms.iph.frag_off = p.iph.frag_off;
1559
1560        if (t->parms.link != p.link) {
1561                t->parms.link = p.link;
1562                mtu = ipgre_tunnel_bind_dev(dev);
1563                if (!tb[IFLA_MTU])
1564                        dev->mtu = mtu;
1565                netdev_state_change(dev);
1566        }
1567
1568        return 0;
1569}
1570
1571static size_t ipgre_get_size(const struct net_device *dev)
1572{
1573        return
1574                /* IFLA_GRE_LINK */
1575                nla_total_size(4) +
1576                /* IFLA_GRE_IFLAGS */
1577                nla_total_size(2) +
1578                /* IFLA_GRE_OFLAGS */
1579                nla_total_size(2) +
1580                /* IFLA_GRE_IKEY */
1581                nla_total_size(4) +
1582                /* IFLA_GRE_OKEY */
1583                nla_total_size(4) +
1584                /* IFLA_GRE_LOCAL */
1585                nla_total_size(4) +
1586                /* IFLA_GRE_REMOTE */
1587                nla_total_size(4) +
1588                /* IFLA_GRE_TTL */
1589                nla_total_size(1) +
1590                /* IFLA_GRE_TOS */
1591                nla_total_size(1) +
1592                /* IFLA_GRE_PMTUDISC */
1593                nla_total_size(1) +
1594                0;
1595}
1596
1597static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1598{
1599        struct ip_tunnel *t = netdev_priv(dev);
1600        struct ip_tunnel_parm *p = &t->parms;
1601
1602        NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1603        NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1604        NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1605        NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1606        NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1607        NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1608        NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1609        NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1610        NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1611        NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1612
1613        return 0;
1614
1615nla_put_failure:
1616        return -EMSGSIZE;
1617}
1618
1619static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1620        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1621        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1622        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1623        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1624        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1625        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1626        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1627        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1628        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1629        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1630};
1631
1632static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1633        .kind           = "gre",
1634        .maxtype        = IFLA_GRE_MAX,
1635        .policy         = ipgre_policy,
1636        .priv_size      = sizeof(struct ip_tunnel),
1637        .setup          = ipgre_tunnel_setup,
1638        .validate       = ipgre_tunnel_validate,
1639        .newlink        = ipgre_newlink,
1640        .changelink     = ipgre_changelink,
1641        .get_size       = ipgre_get_size,
1642        .fill_info      = ipgre_fill_info,
1643};
1644
1645static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1646        .kind           = "gretap",
1647        .maxtype        = IFLA_GRE_MAX,
1648        .policy         = ipgre_policy,
1649        .priv_size      = sizeof(struct ip_tunnel),
1650        .setup          = ipgre_tap_setup,
1651        .validate       = ipgre_tap_validate,
1652        .newlink        = ipgre_newlink,
1653        .changelink     = ipgre_changelink,
1654        .get_size       = ipgre_get_size,
1655        .fill_info      = ipgre_fill_info,
1656};
1657
1658/*
1659 *      And now the modules code and kernel interface.
1660 */
1661
1662static int __init ipgre_init(void)
1663{
1664        int err;
1665
1666        printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1667
1668        if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1669                printk(KERN_INFO "ipgre init: can't add protocol\n");
1670                return -EAGAIN;
1671        }
1672
1673        err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1674        if (err < 0)
1675                goto gen_device_failed;
1676
1677        err = rtnl_link_register(&ipgre_link_ops);
1678        if (err < 0)
1679                goto rtnl_link_failed;
1680
1681        err = rtnl_link_register(&ipgre_tap_ops);
1682        if (err < 0)
1683                goto tap_ops_failed;
1684
1685out:
1686        return err;
1687
1688tap_ops_failed:
1689        rtnl_link_unregister(&ipgre_link_ops);
1690rtnl_link_failed:
1691        unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1692gen_device_failed:
1693        inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1694        goto out;
1695}
1696
1697static void __exit ipgre_fini(void)
1698{
1699        rtnl_link_unregister(&ipgre_tap_ops);
1700        rtnl_link_unregister(&ipgre_link_ops);
1701        unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1702        if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1703                printk(KERN_INFO "ipgre close: can't remove protocol\n");
1704}
1705
1706module_init(ipgre_init);
1707module_exit(ipgre_fini);
1708MODULE_LICENSE("GPL");
1709MODULE_ALIAS_RTNL_LINK("gre");
1710MODULE_ALIAS_RTNL_LINK("gretap");
1711