linux/net/ipv4/ip_gre.c
<<
>>
Prefs
   1/*
   2 *      Linux NET3:     GRE over IP protocol decoder.
   3 *
   4 *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
   5 *
   6 *      This program is free software; you can redistribute it and/or
   7 *      modify it under the terms of the GNU General Public License
   8 *      as published by the Free Software Foundation; either version
   9 *      2 of the License, or (at your option) any later version.
  10 *
  11 */
  12
  13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15#include <linux/capability.h>
  16#include <linux/module.h>
  17#include <linux/types.h>
  18#include <linux/kernel.h>
  19#include <linux/slab.h>
  20#include <asm/uaccess.h>
  21#include <linux/skbuff.h>
  22#include <linux/netdevice.h>
  23#include <linux/in.h>
  24#include <linux/tcp.h>
  25#include <linux/udp.h>
  26#include <linux/if_arp.h>
  27#include <linux/mroute.h>
  28#include <linux/init.h>
  29#include <linux/in6.h>
  30#include <linux/inetdevice.h>
  31#include <linux/igmp.h>
  32#include <linux/netfilter_ipv4.h>
  33#include <linux/etherdevice.h>
  34#include <linux/if_ether.h>
  35
  36#include <net/sock.h>
  37#include <net/ip.h>
  38#include <net/icmp.h>
  39#include <net/protocol.h>
  40#include <net/ipip.h>
  41#include <net/arp.h>
  42#include <net/checksum.h>
  43#include <net/dsfield.h>
  44#include <net/inet_ecn.h>
  45#include <net/xfrm.h>
  46#include <net/net_namespace.h>
  47#include <net/netns/generic.h>
  48#include <net/rtnetlink.h>
  49#include <net/gre.h>
  50
  51#if IS_ENABLED(CONFIG_IPV6)
  52#include <net/ipv6.h>
  53#include <net/ip6_fib.h>
  54#include <net/ip6_route.h>
  55#endif
  56
  57/*
  58   Problems & solutions
  59   --------------------
  60
  61   1. The most important issue is detecting local dead loops.
  62   They would cause complete host lockup in transmit, which
  63   would be "resolved" by stack overflow or, if queueing is enabled,
  64   with infinite looping in net_bh.
  65
  66   We cannot track such dead loops during route installation,
  67   it is infeasible task. The most general solutions would be
  68   to keep skb->encapsulation counter (sort of local ttl),
  69   and silently drop packet when it expires. It is a good
  70   solution, but it supposes maintaining new variable in ALL
  71   skb, even if no tunneling is used.
  72
  73   Current solution: xmit_recursion breaks dead loops. This is a percpu
  74   counter, since when we enter the first ndo_xmit(), cpu migration is
  75   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
  76
  77   2. Networking dead loops would not kill routers, but would really
  78   kill network. IP hop limit plays role of "t->recursion" in this case,
  79   if we copy it from packet being encapsulated to upper header.
  80   It is very good solution, but it introduces two problems:
  81
  82   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
  83     do not work over tunnels.
  84   - traceroute does not work. I planned to relay ICMP from tunnel,
  85     so that this problem would be solved and traceroute output
  86     would be even more informative. This idea appeared to be wrong:
  87     only Linux complies to rfc1812 now (yes, guys, Linux is the only
  88     true router now :-)), all routers (at least, in neighbourhood of mine)
  89     return only 8 bytes of payload. It is the end.
  90
  91   Hence, if we want that OSPF worked or traceroute said something reasonable,
  92   we should search for another solution.
  93
  94   One of them is to parse packet trying to detect inner encapsulation
  95   made by our node. It is difficult or even impossible, especially,
  96   taking into account fragmentation. To be short, ttl is not a solution at all.
  97
  98   Current solution: The solution was UNEXPECTEDLY SIMPLE.
  99   We force DF flag on tunnels with preconfigured hop limit,
 100   that is ALL. :-) Well, it does not remove the problem completely,
 101   but exponential growth of network traffic is changed to linear
 102   (branches, that exceed pmtu are pruned) and tunnel mtu
 103   rapidly degrades to value <68, where looping stops.
 104   Yes, it is not good if there exists a router in the loop,
 105   which does not force DF, even when encapsulating packets have DF set.
 106   But it is not our problem! Nobody could accuse us, we made
 107   all that we could make. Even if it is your gated who injected
 108   fatal route to network, even if it were you who configured
 109   fatal static route: you are innocent. :-)
 110
 111
 112
 113   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
 114   practically identical code. It would be good to glue them
 115   together, but it is not very evident, how to make them modular.
 116   sit is integral part of IPv6, ipip and gre are naturally modular.
 117   We could extract common parts (hash table, ioctl etc)
 118   to a separate module (ip_tunnel.c).
 119
 120   Alexey Kuznetsov.
 121 */
 122
 123static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 124static int ipgre_tunnel_init(struct net_device *dev);
 125static void ipgre_tunnel_setup(struct net_device *dev);
 126static int ipgre_tunnel_bind_dev(struct net_device *dev);
 127
 128/* Fallback tunnel: no source, no destination, no key, no options */
 129
 130#define HASH_SIZE  16
 131
 132static int ipgre_net_id __read_mostly;
 133struct ipgre_net {
 134        struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
 135
 136        struct net_device *fb_tunnel_dev;
 137};
 138
 139/* Tunnel hash table */
 140
 141/*
 142   4 hash tables:
 143
 144   3: (remote,local)
 145   2: (remote,*)
 146   1: (*,local)
 147   0: (*,*)
 148
 149   We require exact key match i.e. if a key is present in packet
 150   it will match only tunnel with the same key; if it is not present,
 151   it will match only keyless tunnel.
 152
 153   All keyless packets, if not matched configured keyless tunnels
 154   will match fallback tunnel.
 155 */
 156
 157#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 158
 159#define tunnels_r_l     tunnels[3]
 160#define tunnels_r       tunnels[2]
 161#define tunnels_l       tunnels[1]
 162#define tunnels_wc      tunnels[0]
 163/*
 164 * Locking : hash tables are protected by RCU and RTNL
 165 */
 166
 167#define for_each_ip_tunnel_rcu(start) \
 168        for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 169
 170/* often modified stats are per cpu, other are shared (netdev->stats) */
 171struct pcpu_tstats {
 172        u64     rx_packets;
 173        u64     rx_bytes;
 174        u64     tx_packets;
 175        u64     tx_bytes;
 176        struct u64_stats_sync   syncp;
 177};
 178
 179static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
 180                                                   struct rtnl_link_stats64 *tot)
 181{
 182        int i;
 183
 184        for_each_possible_cpu(i) {
 185                const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
 186                u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
 187                unsigned int start;
 188
 189                do {
 190                        start = u64_stats_fetch_begin_bh(&tstats->syncp);
 191                        rx_packets = tstats->rx_packets;
 192                        tx_packets = tstats->tx_packets;
 193                        rx_bytes = tstats->rx_bytes;
 194                        tx_bytes = tstats->tx_bytes;
 195                } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
 196
 197                tot->rx_packets += rx_packets;
 198                tot->tx_packets += tx_packets;
 199                tot->rx_bytes   += rx_bytes;
 200                tot->tx_bytes   += tx_bytes;
 201        }
 202
 203        tot->multicast = dev->stats.multicast;
 204        tot->rx_crc_errors = dev->stats.rx_crc_errors;
 205        tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
 206        tot->rx_length_errors = dev->stats.rx_length_errors;
 207        tot->rx_errors = dev->stats.rx_errors;
 208        tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
 209        tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
 210        tot->tx_dropped = dev->stats.tx_dropped;
 211        tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
 212        tot->tx_errors = dev->stats.tx_errors;
 213
 214        return tot;
 215}
 216
 217/* Given src, dst and key, find appropriate for input tunnel. */
 218
 219static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
 220                                             __be32 remote, __be32 local,
 221                                             __be32 key, __be16 gre_proto)
 222{
 223        struct net *net = dev_net(dev);
 224        int link = dev->ifindex;
 225        unsigned int h0 = HASH(remote);
 226        unsigned int h1 = HASH(key);
 227        struct ip_tunnel *t, *cand = NULL;
 228        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 229        int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
 230                       ARPHRD_ETHER : ARPHRD_IPGRE;
 231        int score, cand_score = 4;
 232
 233        for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
 234                if (local != t->parms.iph.saddr ||
 235                    remote != t->parms.iph.daddr ||
 236                    key != t->parms.i_key ||
 237                    !(t->dev->flags & IFF_UP))
 238                        continue;
 239
 240                if (t->dev->type != ARPHRD_IPGRE &&
 241                    t->dev->type != dev_type)
 242                        continue;
 243
 244                score = 0;
 245                if (t->parms.link != link)
 246                        score |= 1;
 247                if (t->dev->type != dev_type)
 248                        score |= 2;
 249                if (score == 0)
 250                        return t;
 251
 252                if (score < cand_score) {
 253                        cand = t;
 254                        cand_score = score;
 255                }
 256        }
 257
 258        for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
 259                if (remote != t->parms.iph.daddr ||
 260                    key != t->parms.i_key ||
 261                    !(t->dev->flags & IFF_UP))
 262                        continue;
 263
 264                if (t->dev->type != ARPHRD_IPGRE &&
 265                    t->dev->type != dev_type)
 266                        continue;
 267
 268                score = 0;
 269                if (t->parms.link != link)
 270                        score |= 1;
 271                if (t->dev->type != dev_type)
 272                        score |= 2;
 273                if (score == 0)
 274                        return t;
 275
 276                if (score < cand_score) {
 277                        cand = t;
 278                        cand_score = score;
 279                }
 280        }
 281
 282        for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
 283                if ((local != t->parms.iph.saddr &&
 284                     (local != t->parms.iph.daddr ||
 285                      !ipv4_is_multicast(local))) ||
 286                    key != t->parms.i_key ||
 287                    !(t->dev->flags & IFF_UP))
 288                        continue;
 289
 290                if (t->dev->type != ARPHRD_IPGRE &&
 291                    t->dev->type != dev_type)
 292                        continue;
 293
 294                score = 0;
 295                if (t->parms.link != link)
 296                        score |= 1;
 297                if (t->dev->type != dev_type)
 298                        score |= 2;
 299                if (score == 0)
 300                        return t;
 301
 302                if (score < cand_score) {
 303                        cand = t;
 304                        cand_score = score;
 305                }
 306        }
 307
 308        for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
 309                if (t->parms.i_key != key ||
 310                    !(t->dev->flags & IFF_UP))
 311                        continue;
 312
 313                if (t->dev->type != ARPHRD_IPGRE &&
 314                    t->dev->type != dev_type)
 315                        continue;
 316
 317                score = 0;
 318                if (t->parms.link != link)
 319                        score |= 1;
 320                if (t->dev->type != dev_type)
 321                        score |= 2;
 322                if (score == 0)
 323                        return t;
 324
 325                if (score < cand_score) {
 326                        cand = t;
 327                        cand_score = score;
 328                }
 329        }
 330
 331        if (cand != NULL)
 332                return cand;
 333
 334        dev = ign->fb_tunnel_dev;
 335        if (dev->flags & IFF_UP)
 336                return netdev_priv(dev);
 337
 338        return NULL;
 339}
 340
 341static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
 342                struct ip_tunnel_parm *parms)
 343{
 344        __be32 remote = parms->iph.daddr;
 345        __be32 local = parms->iph.saddr;
 346        __be32 key = parms->i_key;
 347        unsigned int h = HASH(key);
 348        int prio = 0;
 349
 350        if (local)
 351                prio |= 1;
 352        if (remote && !ipv4_is_multicast(remote)) {
 353                prio |= 2;
 354                h ^= HASH(remote);
 355        }
 356
 357        return &ign->tunnels[prio][h];
 358}
 359
 360static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
 361                struct ip_tunnel *t)
 362{
 363        return __ipgre_bucket(ign, &t->parms);
 364}
 365
 366static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
 367{
 368        struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
 369
 370        rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 371        rcu_assign_pointer(*tp, t);
 372}
 373
 374static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 375{
 376        struct ip_tunnel __rcu **tp;
 377        struct ip_tunnel *iter;
 378
 379        for (tp = ipgre_bucket(ign, t);
 380             (iter = rtnl_dereference(*tp)) != NULL;
 381             tp = &iter->next) {
 382                if (t == iter) {
 383                        rcu_assign_pointer(*tp, t->next);
 384                        break;
 385                }
 386        }
 387}
 388
 389static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 390                                           struct ip_tunnel_parm *parms,
 391                                           int type)
 392{
 393        __be32 remote = parms->iph.daddr;
 394        __be32 local = parms->iph.saddr;
 395        __be32 key = parms->i_key;
 396        int link = parms->link;
 397        struct ip_tunnel *t;
 398        struct ip_tunnel __rcu **tp;
 399        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 400
 401        for (tp = __ipgre_bucket(ign, parms);
 402             (t = rtnl_dereference(*tp)) != NULL;
 403             tp = &t->next)
 404                if (local == t->parms.iph.saddr &&
 405                    remote == t->parms.iph.daddr &&
 406                    key == t->parms.i_key &&
 407                    link == t->parms.link &&
 408                    type == t->dev->type)
 409                        break;
 410
 411        return t;
 412}
 413
 414static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
 415                struct ip_tunnel_parm *parms, int create)
 416{
 417        struct ip_tunnel *t, *nt;
 418        struct net_device *dev;
 419        char name[IFNAMSIZ];
 420        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 421
 422        t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
 423        if (t || !create)
 424                return t;
 425
 426        if (parms->name[0])
 427                strlcpy(name, parms->name, IFNAMSIZ);
 428        else
 429                strcpy(name, "gre%d");
 430
 431        dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
 432        if (!dev)
 433                return NULL;
 434
 435        dev_net_set(dev, net);
 436
 437        nt = netdev_priv(dev);
 438        nt->parms = *parms;
 439        dev->rtnl_link_ops = &ipgre_link_ops;
 440
 441        dev->mtu = ipgre_tunnel_bind_dev(dev);
 442
 443        if (register_netdevice(dev) < 0)
 444                goto failed_free;
 445
 446        /* Can use a lockless transmit, unless we generate output sequences */
 447        if (!(nt->parms.o_flags & GRE_SEQ))
 448                dev->features |= NETIF_F_LLTX;
 449
 450        dev_hold(dev);
 451        ipgre_tunnel_link(ign, nt);
 452        return nt;
 453
 454failed_free:
 455        free_netdev(dev);
 456        return NULL;
 457}
 458
 459static void ipgre_tunnel_uninit(struct net_device *dev)
 460{
 461        struct net *net = dev_net(dev);
 462        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 463
 464        ipgre_tunnel_unlink(ign, netdev_priv(dev));
 465        dev_put(dev);
 466}
 467
 468
 469static void ipgre_err(struct sk_buff *skb, u32 info)
 470{
 471
 472/* All the routers (except for Linux) return only
 473   8 bytes of packet payload. It means, that precise relaying of
 474   ICMP in the real Internet is absolutely infeasible.
 475
 476   Moreover, Cisco "wise men" put GRE key to the third word
 477   in GRE header. It makes impossible maintaining even soft state for keyed
 478   GRE tunnels with enabled checksum. Tell them "thank you".
 479
 480   Well, I wonder, rfc1812 was written by Cisco employee,
 481   what the hell these idiots break standards established
 482   by themselves???
 483 */
 484
 485        const struct iphdr *iph = (const struct iphdr *)skb->data;
 486        __be16       *p = (__be16 *)(skb->data+(iph->ihl<<2));
 487        int grehlen = (iph->ihl<<2) + 4;
 488        const int type = icmp_hdr(skb)->type;
 489        const int code = icmp_hdr(skb)->code;
 490        struct ip_tunnel *t;
 491        __be16 flags;
 492
 493        flags = p[0];
 494        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
 495                if (flags&(GRE_VERSION|GRE_ROUTING))
 496                        return;
 497                if (flags&GRE_KEY) {
 498                        grehlen += 4;
 499                        if (flags&GRE_CSUM)
 500                                grehlen += 4;
 501                }
 502        }
 503
 504        /* If only 8 bytes returned, keyed message will be dropped here */
 505        if (skb_headlen(skb) < grehlen)
 506                return;
 507
 508        switch (type) {
 509        default:
 510        case ICMP_PARAMETERPROB:
 511                return;
 512
 513        case ICMP_DEST_UNREACH:
 514                switch (code) {
 515                case ICMP_SR_FAILED:
 516                case ICMP_PORT_UNREACH:
 517                        /* Impossible event. */
 518                        return;
 519                case ICMP_FRAG_NEEDED:
 520                        /* Soft state for pmtu is maintained by IP core. */
 521                        return;
 522                default:
 523                        /* All others are translated to HOST_UNREACH.
 524                           rfc2003 contains "deep thoughts" about NET_UNREACH,
 525                           I believe they are just ether pollution. --ANK
 526                         */
 527                        break;
 528                }
 529                break;
 530        case ICMP_TIME_EXCEEDED:
 531                if (code != ICMP_EXC_TTL)
 532                        return;
 533                break;
 534        }
 535
 536        rcu_read_lock();
 537        t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
 538                                flags & GRE_KEY ?
 539                                *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
 540                                p[1]);
 541        if (t == NULL || t->parms.iph.daddr == 0 ||
 542            ipv4_is_multicast(t->parms.iph.daddr))
 543                goto out;
 544
 545        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
 546                goto out;
 547
 548        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 549                t->err_count++;
 550        else
 551                t->err_count = 1;
 552        t->err_time = jiffies;
 553out:
 554        rcu_read_unlock();
 555}
 556
 557static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
 558{
 559        if (INET_ECN_is_ce(iph->tos)) {
 560                if (skb->protocol == htons(ETH_P_IP)) {
 561                        IP_ECN_set_ce(ip_hdr(skb));
 562                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 563                        IP6_ECN_set_ce(ipv6_hdr(skb));
 564                }
 565        }
 566}
 567
 568static inline u8
 569ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
 570{
 571        u8 inner = 0;
 572        if (skb->protocol == htons(ETH_P_IP))
 573                inner = old_iph->tos;
 574        else if (skb->protocol == htons(ETH_P_IPV6))
 575                inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 576        return INET_ECN_encapsulate(tos, inner);
 577}
 578
 579static int ipgre_rcv(struct sk_buff *skb)
 580{
 581        const struct iphdr *iph;
 582        u8     *h;
 583        __be16    flags;
 584        __sum16   csum = 0;
 585        __be32 key = 0;
 586        u32    seqno = 0;
 587        struct ip_tunnel *tunnel;
 588        int    offset = 4;
 589        __be16 gre_proto;
 590
 591        if (!pskb_may_pull(skb, 16))
 592                goto drop_nolock;
 593
 594        iph = ip_hdr(skb);
 595        h = skb->data;
 596        flags = *(__be16 *)h;
 597
 598        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
 599                /* - Version must be 0.
 600                   - We do not support routing headers.
 601                 */
 602                if (flags&(GRE_VERSION|GRE_ROUTING))
 603                        goto drop_nolock;
 604
 605                if (flags&GRE_CSUM) {
 606                        switch (skb->ip_summed) {
 607                        case CHECKSUM_COMPLETE:
 608                                csum = csum_fold(skb->csum);
 609                                if (!csum)
 610                                        break;
 611                                /* fall through */
 612                        case CHECKSUM_NONE:
 613                                skb->csum = 0;
 614                                csum = __skb_checksum_complete(skb);
 615                                skb->ip_summed = CHECKSUM_COMPLETE;
 616                        }
 617                        offset += 4;
 618                }
 619                if (flags&GRE_KEY) {
 620                        key = *(__be32 *)(h + offset);
 621                        offset += 4;
 622                }
 623                if (flags&GRE_SEQ) {
 624                        seqno = ntohl(*(__be32 *)(h + offset));
 625                        offset += 4;
 626                }
 627        }
 628
 629        gre_proto = *(__be16 *)(h + 2);
 630
 631        rcu_read_lock();
 632        if ((tunnel = ipgre_tunnel_lookup(skb->dev,
 633                                          iph->saddr, iph->daddr, key,
 634                                          gre_proto))) {
 635                struct pcpu_tstats *tstats;
 636
 637                secpath_reset(skb);
 638
 639                skb->protocol = gre_proto;
 640                /* WCCP version 1 and 2 protocol decoding.
 641                 * - Change protocol to IP
 642                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
 643                 */
 644                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
 645                        skb->protocol = htons(ETH_P_IP);
 646                        if ((*(h + offset) & 0xF0) != 0x40)
 647                                offset += 4;
 648                }
 649
 650                skb->mac_header = skb->network_header;
 651                __pskb_pull(skb, offset);
 652                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
 653                skb->pkt_type = PACKET_HOST;
 654#ifdef CONFIG_NET_IPGRE_BROADCAST
 655                if (ipv4_is_multicast(iph->daddr)) {
 656                        /* Looped back packet, drop it! */
 657                        if (rt_is_output_route(skb_rtable(skb)))
 658                                goto drop;
 659                        tunnel->dev->stats.multicast++;
 660                        skb->pkt_type = PACKET_BROADCAST;
 661                }
 662#endif
 663
 664                if (((flags&GRE_CSUM) && csum) ||
 665                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
 666                        tunnel->dev->stats.rx_crc_errors++;
 667                        tunnel->dev->stats.rx_errors++;
 668                        goto drop;
 669                }
 670                if (tunnel->parms.i_flags&GRE_SEQ) {
 671                        if (!(flags&GRE_SEQ) ||
 672                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
 673                                tunnel->dev->stats.rx_fifo_errors++;
 674                                tunnel->dev->stats.rx_errors++;
 675                                goto drop;
 676                        }
 677                        tunnel->i_seqno = seqno + 1;
 678                }
 679
 680                /* Warning: All skb pointers will be invalidated! */
 681                if (tunnel->dev->type == ARPHRD_ETHER) {
 682                        if (!pskb_may_pull(skb, ETH_HLEN)) {
 683                                tunnel->dev->stats.rx_length_errors++;
 684                                tunnel->dev->stats.rx_errors++;
 685                                goto drop;
 686                        }
 687
 688                        iph = ip_hdr(skb);
 689                        skb->protocol = eth_type_trans(skb, tunnel->dev);
 690                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 691                }
 692
 693                tstats = this_cpu_ptr(tunnel->dev->tstats);
 694                u64_stats_update_begin(&tstats->syncp);
 695                tstats->rx_packets++;
 696                tstats->rx_bytes += skb->len;
 697                u64_stats_update_end(&tstats->syncp);
 698
 699                __skb_tunnel_rx(skb, tunnel->dev);
 700
 701                skb_reset_network_header(skb);
 702                ipgre_ecn_decapsulate(iph, skb);
 703
 704                netif_rx(skb);
 705
 706                rcu_read_unlock();
 707                return 0;
 708        }
 709        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 710
 711drop:
 712        rcu_read_unlock();
 713drop_nolock:
 714        kfree_skb(skb);
 715        return 0;
 716}
 717
 718static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 719{
 720        struct ip_tunnel *tunnel = netdev_priv(dev);
 721        struct pcpu_tstats *tstats;
 722        const struct iphdr  *old_iph = ip_hdr(skb);
 723        const struct iphdr  *tiph;
 724        struct flowi4 fl4;
 725        u8     tos;
 726        __be16 df;
 727        struct rtable *rt;                      /* Route to the other host */
 728        struct net_device *tdev;                /* Device to other host */
 729        struct iphdr  *iph;                     /* Our new IP header */
 730        unsigned int max_headroom;              /* The extra header space needed */
 731        int    gre_hlen;
 732        __be32 dst;
 733        int    mtu;
 734
 735        if (dev->type == ARPHRD_ETHER)
 736                IPCB(skb)->flags = 0;
 737
 738        if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 739                gre_hlen = 0;
 740                tiph = (const struct iphdr *)skb->data;
 741        } else {
 742                gre_hlen = tunnel->hlen;
 743                tiph = &tunnel->parms.iph;
 744        }
 745
 746        if ((dst = tiph->daddr) == 0) {
 747                /* NBMA tunnel */
 748
 749                if (skb_dst(skb) == NULL) {
 750                        dev->stats.tx_fifo_errors++;
 751                        goto tx_error;
 752                }
 753
 754                if (skb->protocol == htons(ETH_P_IP)) {
 755                        rt = skb_rtable(skb);
 756                        dst = rt->rt_gateway;
 757                }
 758#if IS_ENABLED(CONFIG_IPV6)
 759                else if (skb->protocol == htons(ETH_P_IPV6)) {
 760                        const struct in6_addr *addr6;
 761                        struct neighbour *neigh;
 762                        bool do_tx_error_icmp;
 763                        int addr_type;
 764
 765                        neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
 766                        if (neigh == NULL)
 767                                goto tx_error;
 768
 769                        addr6 = (const struct in6_addr *)&neigh->primary_key;
 770                        addr_type = ipv6_addr_type(addr6);
 771
 772                        if (addr_type == IPV6_ADDR_ANY) {
 773                                addr6 = &ipv6_hdr(skb)->daddr;
 774                                addr_type = ipv6_addr_type(addr6);
 775                        }
 776
 777                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 778                                do_tx_error_icmp = true;
 779                        else {
 780                                do_tx_error_icmp = false;
 781                                dst = addr6->s6_addr32[3];
 782                        }
 783                        neigh_release(neigh);
 784                        if (do_tx_error_icmp)
 785                                goto tx_error_icmp;
 786                }
 787#endif
 788                else
 789                        goto tx_error;
 790        }
 791
 792        tos = tiph->tos;
 793        if (tos == 1) {
 794                tos = 0;
 795                if (skb->protocol == htons(ETH_P_IP))
 796                        tos = old_iph->tos;
 797                else if (skb->protocol == htons(ETH_P_IPV6))
 798                        tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 799        }
 800
 801        rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
 802                                 tunnel->parms.o_key, RT_TOS(tos),
 803                                 tunnel->parms.link);
 804        if (IS_ERR(rt)) {
 805                dev->stats.tx_carrier_errors++;
 806                goto tx_error;
 807        }
 808        tdev = rt->dst.dev;
 809
 810        if (tdev == dev) {
 811                ip_rt_put(rt);
 812                dev->stats.collisions++;
 813                goto tx_error;
 814        }
 815
 816        df = tiph->frag_off;
 817        if (df)
 818                mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
 819        else
 820                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 821
 822        if (skb_dst(skb))
 823                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 824
 825        if (skb->protocol == htons(ETH_P_IP)) {
 826                df |= (old_iph->frag_off&htons(IP_DF));
 827
 828                if ((old_iph->frag_off&htons(IP_DF)) &&
 829                    mtu < ntohs(old_iph->tot_len)) {
 830                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 831                        ip_rt_put(rt);
 832                        goto tx_error;
 833                }
 834        }
 835#if IS_ENABLED(CONFIG_IPV6)
 836        else if (skb->protocol == htons(ETH_P_IPV6)) {
 837                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 838
 839                if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
 840                        if ((tunnel->parms.iph.daddr &&
 841                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 842                            rt6->rt6i_dst.plen == 128) {
 843                                rt6->rt6i_flags |= RTF_MODIFIED;
 844                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 845                        }
 846                }
 847
 848                if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
 849                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 850                        ip_rt_put(rt);
 851                        goto tx_error;
 852                }
 853        }
 854#endif
 855
 856        if (tunnel->err_count > 0) {
 857                if (time_before(jiffies,
 858                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 859                        tunnel->err_count--;
 860
 861                        dst_link_failure(skb);
 862                } else
 863                        tunnel->err_count = 0;
 864        }
 865
 866        max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
 867
 868        if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 869            (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
 870                struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 871                if (max_headroom > dev->needed_headroom)
 872                        dev->needed_headroom = max_headroom;
 873                if (!new_skb) {
 874                        ip_rt_put(rt);
 875                        dev->stats.tx_dropped++;
 876                        dev_kfree_skb(skb);
 877                        return NETDEV_TX_OK;
 878                }
 879                if (skb->sk)
 880                        skb_set_owner_w(new_skb, skb->sk);
 881                dev_kfree_skb(skb);
 882                skb = new_skb;
 883                old_iph = ip_hdr(skb);
 884        }
 885
 886        skb_reset_transport_header(skb);
 887        skb_push(skb, gre_hlen);
 888        skb_reset_network_header(skb);
 889        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 890        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 891                              IPSKB_REROUTED);
 892        skb_dst_drop(skb);
 893        skb_dst_set(skb, &rt->dst);
 894
 895        /*
 896         *      Push down and install the IPIP header.
 897         */
 898
 899        iph                     =       ip_hdr(skb);
 900        iph->version            =       4;
 901        iph->ihl                =       sizeof(struct iphdr) >> 2;
 902        iph->frag_off           =       df;
 903        iph->protocol           =       IPPROTO_GRE;
 904        iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
 905        iph->daddr              =       fl4.daddr;
 906        iph->saddr              =       fl4.saddr;
 907
 908        if ((iph->ttl = tiph->ttl) == 0) {
 909                if (skb->protocol == htons(ETH_P_IP))
 910                        iph->ttl = old_iph->ttl;
 911#if IS_ENABLED(CONFIG_IPV6)
 912                else if (skb->protocol == htons(ETH_P_IPV6))
 913                        iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
 914#endif
 915                else
 916                        iph->ttl = ip4_dst_hoplimit(&rt->dst);
 917        }
 918
 919        ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
 920        ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
 921                                   htons(ETH_P_TEB) : skb->protocol;
 922
 923        if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 924                __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
 925
 926                if (tunnel->parms.o_flags&GRE_SEQ) {
 927                        ++tunnel->o_seqno;
 928                        *ptr = htonl(tunnel->o_seqno);
 929                        ptr--;
 930                }
 931                if (tunnel->parms.o_flags&GRE_KEY) {
 932                        *ptr = tunnel->parms.o_key;
 933                        ptr--;
 934                }
 935                if (tunnel->parms.o_flags&GRE_CSUM) {
 936                        *ptr = 0;
 937                        *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
 938                }
 939        }
 940
 941        nf_reset(skb);
 942        tstats = this_cpu_ptr(dev->tstats);
 943        __IPTUNNEL_XMIT(tstats, &dev->stats);
 944        return NETDEV_TX_OK;
 945
 946#if IS_ENABLED(CONFIG_IPV6)
 947tx_error_icmp:
 948        dst_link_failure(skb);
 949#endif
 950tx_error:
 951        dev->stats.tx_errors++;
 952        dev_kfree_skb(skb);
 953        return NETDEV_TX_OK;
 954}
 955
 956static int ipgre_tunnel_bind_dev(struct net_device *dev)
 957{
 958        struct net_device *tdev = NULL;
 959        struct ip_tunnel *tunnel;
 960        const struct iphdr *iph;
 961        int hlen = LL_MAX_HEADER;
 962        int mtu = ETH_DATA_LEN;
 963        int addend = sizeof(struct iphdr) + 4;
 964
 965        tunnel = netdev_priv(dev);
 966        iph = &tunnel->parms.iph;
 967
 968        /* Guess output device to choose reasonable mtu and needed_headroom */
 969
 970        if (iph->daddr) {
 971                struct flowi4 fl4;
 972                struct rtable *rt;
 973
 974                rt = ip_route_output_gre(dev_net(dev), &fl4,
 975                                         iph->daddr, iph->saddr,
 976                                         tunnel->parms.o_key,
 977                                         RT_TOS(iph->tos),
 978                                         tunnel->parms.link);
 979                if (!IS_ERR(rt)) {
 980                        tdev = rt->dst.dev;
 981                        ip_rt_put(rt);
 982                }
 983
 984                if (dev->type != ARPHRD_ETHER)
 985                        dev->flags |= IFF_POINTOPOINT;
 986        }
 987
 988        if (!tdev && tunnel->parms.link)
 989                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
 990
 991        if (tdev) {
 992                hlen = tdev->hard_header_len + tdev->needed_headroom;
 993                mtu = tdev->mtu;
 994        }
 995        dev->iflink = tunnel->parms.link;
 996
 997        /* Precalculate GRE options length */
 998        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
 999                if (tunnel->parms.o_flags&GRE_CSUM)
1000                        addend += 4;
1001                if (tunnel->parms.o_flags&GRE_KEY)
1002                        addend += 4;
1003                if (tunnel->parms.o_flags&GRE_SEQ)
1004                        addend += 4;
1005        }
1006        dev->needed_headroom = addend + hlen;
1007        mtu -= dev->hard_header_len + addend;
1008
1009        if (mtu < 68)
1010                mtu = 68;
1011
1012        tunnel->hlen = addend;
1013
1014        return mtu;
1015}
1016
1017static int
1018ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1019{
1020        int err = 0;
1021        struct ip_tunnel_parm p;
1022        struct ip_tunnel *t;
1023        struct net *net = dev_net(dev);
1024        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1025
1026        switch (cmd) {
1027        case SIOCGETTUNNEL:
1028                t = NULL;
1029                if (dev == ign->fb_tunnel_dev) {
1030                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1031                                err = -EFAULT;
1032                                break;
1033                        }
1034                        t = ipgre_tunnel_locate(net, &p, 0);
1035                }
1036                if (t == NULL)
1037                        t = netdev_priv(dev);
1038                memcpy(&p, &t->parms, sizeof(p));
1039                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1040                        err = -EFAULT;
1041                break;
1042
1043        case SIOCADDTUNNEL:
1044        case SIOCCHGTUNNEL:
1045                err = -EPERM;
1046                if (!capable(CAP_NET_ADMIN))
1047                        goto done;
1048
1049                err = -EFAULT;
1050                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1051                        goto done;
1052
1053                err = -EINVAL;
1054                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1055                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1056                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1057                        goto done;
1058                if (p.iph.ttl)
1059                        p.iph.frag_off |= htons(IP_DF);
1060
1061                if (!(p.i_flags&GRE_KEY))
1062                        p.i_key = 0;
1063                if (!(p.o_flags&GRE_KEY))
1064                        p.o_key = 0;
1065
1066                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1067
1068                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1069                        if (t != NULL) {
1070                                if (t->dev != dev) {
1071                                        err = -EEXIST;
1072                                        break;
1073                                }
1074                        } else {
1075                                unsigned int nflags = 0;
1076
1077                                t = netdev_priv(dev);
1078
1079                                if (ipv4_is_multicast(p.iph.daddr))
1080                                        nflags = IFF_BROADCAST;
1081                                else if (p.iph.daddr)
1082                                        nflags = IFF_POINTOPOINT;
1083
1084                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1085                                        err = -EINVAL;
1086                                        break;
1087                                }
1088                                ipgre_tunnel_unlink(ign, t);
1089                                synchronize_net();
1090                                t->parms.iph.saddr = p.iph.saddr;
1091                                t->parms.iph.daddr = p.iph.daddr;
1092                                t->parms.i_key = p.i_key;
1093                                t->parms.o_key = p.o_key;
1094                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
1095                                memcpy(dev->broadcast, &p.iph.daddr, 4);
1096                                ipgre_tunnel_link(ign, t);
1097                                netdev_state_change(dev);
1098                        }
1099                }
1100
1101                if (t) {
1102                        err = 0;
1103                        if (cmd == SIOCCHGTUNNEL) {
1104                                t->parms.iph.ttl = p.iph.ttl;
1105                                t->parms.iph.tos = p.iph.tos;
1106                                t->parms.iph.frag_off = p.iph.frag_off;
1107                                if (t->parms.link != p.link) {
1108                                        t->parms.link = p.link;
1109                                        dev->mtu = ipgre_tunnel_bind_dev(dev);
1110                                        netdev_state_change(dev);
1111                                }
1112                        }
1113                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1114                                err = -EFAULT;
1115                } else
1116                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1117                break;
1118
1119        case SIOCDELTUNNEL:
1120                err = -EPERM;
1121                if (!capable(CAP_NET_ADMIN))
1122                        goto done;
1123
1124                if (dev == ign->fb_tunnel_dev) {
1125                        err = -EFAULT;
1126                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1127                                goto done;
1128                        err = -ENOENT;
1129                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1130                                goto done;
1131                        err = -EPERM;
1132                        if (t == netdev_priv(ign->fb_tunnel_dev))
1133                                goto done;
1134                        dev = t->dev;
1135                }
1136                unregister_netdevice(dev);
1137                err = 0;
1138                break;
1139
1140        default:
1141                err = -EINVAL;
1142        }
1143
1144done:
1145        return err;
1146}
1147
1148static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1149{
1150        struct ip_tunnel *tunnel = netdev_priv(dev);
1151        if (new_mtu < 68 ||
1152            new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1153                return -EINVAL;
1154        dev->mtu = new_mtu;
1155        return 0;
1156}
1157
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1167
1168   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1169   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1170
1171   ping -t 255 224.66.66.66
1172
1173   If nobody answers, mbone does not work.
1174
1175   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1176   ip addr add 10.66.66.<somewhat>/24 dev Universe
1177   ifconfig Universe up
1178   ifconfig Universe add fe80::<Your_real_addr>/10
1179   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1180   ftp 10.66.66.66
1181   ...
1182   ftp fec0:6666:6666::193.233.7.65
1183   ...
1184
1185 */
1186
1187static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1188                        unsigned short type,
1189                        const void *daddr, const void *saddr, unsigned int len)
1190{
1191        struct ip_tunnel *t = netdev_priv(dev);
1192        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1193        __be16 *p = (__be16 *)(iph+1);
1194
1195        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1196        p[0]            = t->parms.o_flags;
1197        p[1]            = htons(type);
1198
1199        /*
1200         *      Set the source hardware address.
1201         */
1202
1203        if (saddr)
1204                memcpy(&iph->saddr, saddr, 4);
1205        if (daddr)
1206                memcpy(&iph->daddr, daddr, 4);
1207        if (iph->daddr)
1208                return t->hlen;
1209
1210        return -t->hlen;
1211}
1212
1213static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1214{
1215        const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1216        memcpy(haddr, &iph->saddr, 4);
1217        return 4;
1218}
1219
1220static const struct header_ops ipgre_header_ops = {
1221        .create = ipgre_header,
1222        .parse  = ipgre_header_parse,
1223};
1224
1225#ifdef CONFIG_NET_IPGRE_BROADCAST
1226static int ipgre_open(struct net_device *dev)
1227{
1228        struct ip_tunnel *t = netdev_priv(dev);
1229
1230        if (ipv4_is_multicast(t->parms.iph.daddr)) {
1231                struct flowi4 fl4;
1232                struct rtable *rt;
1233
1234                rt = ip_route_output_gre(dev_net(dev), &fl4,
1235                                         t->parms.iph.daddr,
1236                                         t->parms.iph.saddr,
1237                                         t->parms.o_key,
1238                                         RT_TOS(t->parms.iph.tos),
1239                                         t->parms.link);
1240                if (IS_ERR(rt))
1241                        return -EADDRNOTAVAIL;
1242                dev = rt->dst.dev;
1243                ip_rt_put(rt);
1244                if (__in_dev_get_rtnl(dev) == NULL)
1245                        return -EADDRNOTAVAIL;
1246                t->mlink = dev->ifindex;
1247                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1248        }
1249        return 0;
1250}
1251
1252static int ipgre_close(struct net_device *dev)
1253{
1254        struct ip_tunnel *t = netdev_priv(dev);
1255
1256        if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1257                struct in_device *in_dev;
1258                in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1259                if (in_dev)
1260                        ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1261        }
1262        return 0;
1263}
1264
1265#endif
1266
1267static const struct net_device_ops ipgre_netdev_ops = {
1268        .ndo_init               = ipgre_tunnel_init,
1269        .ndo_uninit             = ipgre_tunnel_uninit,
1270#ifdef CONFIG_NET_IPGRE_BROADCAST
1271        .ndo_open               = ipgre_open,
1272        .ndo_stop               = ipgre_close,
1273#endif
1274        .ndo_start_xmit         = ipgre_tunnel_xmit,
1275        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1276        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1277        .ndo_get_stats64        = ipgre_get_stats64,
1278};
1279
1280static void ipgre_dev_free(struct net_device *dev)
1281{
1282        free_percpu(dev->tstats);
1283        free_netdev(dev);
1284}
1285
1286static void ipgre_tunnel_setup(struct net_device *dev)
1287{
1288        dev->netdev_ops         = &ipgre_netdev_ops;
1289        dev->destructor         = ipgre_dev_free;
1290
1291        dev->type               = ARPHRD_IPGRE;
1292        dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1293        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1294        dev->flags              = IFF_NOARP;
1295        dev->iflink             = 0;
1296        dev->addr_len           = 4;
1297        dev->features           |= NETIF_F_NETNS_LOCAL;
1298        dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1299}
1300
1301static int ipgre_tunnel_init(struct net_device *dev)
1302{
1303        struct ip_tunnel *tunnel;
1304        struct iphdr *iph;
1305
1306        tunnel = netdev_priv(dev);
1307        iph = &tunnel->parms.iph;
1308
1309        tunnel->dev = dev;
1310        strcpy(tunnel->parms.name, dev->name);
1311
1312        memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1313        memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1314
1315        if (iph->daddr) {
1316#ifdef CONFIG_NET_IPGRE_BROADCAST
1317                if (ipv4_is_multicast(iph->daddr)) {
1318                        if (!iph->saddr)
1319                                return -EINVAL;
1320                        dev->flags = IFF_BROADCAST;
1321                        dev->header_ops = &ipgre_header_ops;
1322                }
1323#endif
1324        } else
1325                dev->header_ops = &ipgre_header_ops;
1326
1327        dev->tstats = alloc_percpu(struct pcpu_tstats);
1328        if (!dev->tstats)
1329                return -ENOMEM;
1330
1331        return 0;
1332}
1333
1334static void ipgre_fb_tunnel_init(struct net_device *dev)
1335{
1336        struct ip_tunnel *tunnel = netdev_priv(dev);
1337        struct iphdr *iph = &tunnel->parms.iph;
1338
1339        tunnel->dev = dev;
1340        strcpy(tunnel->parms.name, dev->name);
1341
1342        iph->version            = 4;
1343        iph->protocol           = IPPROTO_GRE;
1344        iph->ihl                = 5;
1345        tunnel->hlen            = sizeof(struct iphdr) + 4;
1346
1347        dev_hold(dev);
1348}
1349
1350
1351static const struct gre_protocol ipgre_protocol = {
1352        .handler     = ipgre_rcv,
1353        .err_handler = ipgre_err,
1354};
1355
1356static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1357{
1358        int prio;
1359
1360        for (prio = 0; prio < 4; prio++) {
1361                int h;
1362                for (h = 0; h < HASH_SIZE; h++) {
1363                        struct ip_tunnel *t;
1364
1365                        t = rtnl_dereference(ign->tunnels[prio][h]);
1366
1367                        while (t != NULL) {
1368                                unregister_netdevice_queue(t->dev, head);
1369                                t = rtnl_dereference(t->next);
1370                        }
1371                }
1372        }
1373}
1374
1375static int __net_init ipgre_init_net(struct net *net)
1376{
1377        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1378        int err;
1379
1380        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1381                                           ipgre_tunnel_setup);
1382        if (!ign->fb_tunnel_dev) {
1383                err = -ENOMEM;
1384                goto err_alloc_dev;
1385        }
1386        dev_net_set(ign->fb_tunnel_dev, net);
1387
1388        ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1389        ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1390
1391        if ((err = register_netdev(ign->fb_tunnel_dev)))
1392                goto err_reg_dev;
1393
1394        rcu_assign_pointer(ign->tunnels_wc[0],
1395                           netdev_priv(ign->fb_tunnel_dev));
1396        return 0;
1397
1398err_reg_dev:
1399        ipgre_dev_free(ign->fb_tunnel_dev);
1400err_alloc_dev:
1401        return err;
1402}
1403
1404static void __net_exit ipgre_exit_net(struct net *net)
1405{
1406        struct ipgre_net *ign;
1407        LIST_HEAD(list);
1408
1409        ign = net_generic(net, ipgre_net_id);
1410        rtnl_lock();
1411        ipgre_destroy_tunnels(ign, &list);
1412        unregister_netdevice_many(&list);
1413        rtnl_unlock();
1414}
1415
1416static struct pernet_operations ipgre_net_ops = {
1417        .init = ipgre_init_net,
1418        .exit = ipgre_exit_net,
1419        .id   = &ipgre_net_id,
1420        .size = sizeof(struct ipgre_net),
1421};
1422
1423static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1424{
1425        __be16 flags;
1426
1427        if (!data)
1428                return 0;
1429
1430        flags = 0;
1431        if (data[IFLA_GRE_IFLAGS])
1432                flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1433        if (data[IFLA_GRE_OFLAGS])
1434                flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1435        if (flags & (GRE_VERSION|GRE_ROUTING))
1436                return -EINVAL;
1437
1438        return 0;
1439}
1440
1441static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1442{
1443        __be32 daddr;
1444
1445        if (tb[IFLA_ADDRESS]) {
1446                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1447                        return -EINVAL;
1448                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1449                        return -EADDRNOTAVAIL;
1450        }
1451
1452        if (!data)
1453                goto out;
1454
1455        if (data[IFLA_GRE_REMOTE]) {
1456                memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1457                if (!daddr)
1458                        return -EINVAL;
1459        }
1460
1461out:
1462        return ipgre_tunnel_validate(tb, data);
1463}
1464
1465static void ipgre_netlink_parms(struct nlattr *data[],
1466                                struct ip_tunnel_parm *parms)
1467{
1468        memset(parms, 0, sizeof(*parms));
1469
1470        parms->iph.protocol = IPPROTO_GRE;
1471
1472        if (!data)
1473                return;
1474
1475        if (data[IFLA_GRE_LINK])
1476                parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1477
1478        if (data[IFLA_GRE_IFLAGS])
1479                parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1480
1481        if (data[IFLA_GRE_OFLAGS])
1482                parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1483
1484        if (data[IFLA_GRE_IKEY])
1485                parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1486
1487        if (data[IFLA_GRE_OKEY])
1488                parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1489
1490        if (data[IFLA_GRE_LOCAL])
1491                parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1492
1493        if (data[IFLA_GRE_REMOTE])
1494                parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1495
1496        if (data[IFLA_GRE_TTL])
1497                parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1498
1499        if (data[IFLA_GRE_TOS])
1500                parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1501
1502        if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1503                parms->iph.frag_off = htons(IP_DF);
1504}
1505
1506static int ipgre_tap_init(struct net_device *dev)
1507{
1508        struct ip_tunnel *tunnel;
1509
1510        tunnel = netdev_priv(dev);
1511
1512        tunnel->dev = dev;
1513        strcpy(tunnel->parms.name, dev->name);
1514
1515        ipgre_tunnel_bind_dev(dev);
1516
1517        dev->tstats = alloc_percpu(struct pcpu_tstats);
1518        if (!dev->tstats)
1519                return -ENOMEM;
1520
1521        return 0;
1522}
1523
1524static const struct net_device_ops ipgre_tap_netdev_ops = {
1525        .ndo_init               = ipgre_tap_init,
1526        .ndo_uninit             = ipgre_tunnel_uninit,
1527        .ndo_start_xmit         = ipgre_tunnel_xmit,
1528        .ndo_set_mac_address    = eth_mac_addr,
1529        .ndo_validate_addr      = eth_validate_addr,
1530        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1531        .ndo_get_stats64        = ipgre_get_stats64,
1532};
1533
1534static void ipgre_tap_setup(struct net_device *dev)
1535{
1536
1537        ether_setup(dev);
1538
1539        dev->netdev_ops         = &ipgre_tap_netdev_ops;
1540        dev->destructor         = ipgre_dev_free;
1541
1542        dev->iflink             = 0;
1543        dev->features           |= NETIF_F_NETNS_LOCAL;
1544}
1545
1546static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1547                         struct nlattr *data[])
1548{
1549        struct ip_tunnel *nt;
1550        struct net *net = dev_net(dev);
1551        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1552        int mtu;
1553        int err;
1554
1555        nt = netdev_priv(dev);
1556        ipgre_netlink_parms(data, &nt->parms);
1557
1558        if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1559                return -EEXIST;
1560
1561        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1562                eth_hw_addr_random(dev);
1563
1564        mtu = ipgre_tunnel_bind_dev(dev);
1565        if (!tb[IFLA_MTU])
1566                dev->mtu = mtu;
1567
1568        /* Can use a lockless transmit, unless we generate output sequences */
1569        if (!(nt->parms.o_flags & GRE_SEQ))
1570                dev->features |= NETIF_F_LLTX;
1571
1572        err = register_netdevice(dev);
1573        if (err)
1574                goto out;
1575
1576        dev_hold(dev);
1577        ipgre_tunnel_link(ign, nt);
1578
1579out:
1580        return err;
1581}
1582
1583static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1584                            struct nlattr *data[])
1585{
1586        struct ip_tunnel *t, *nt;
1587        struct net *net = dev_net(dev);
1588        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1589        struct ip_tunnel_parm p;
1590        int mtu;
1591
1592        if (dev == ign->fb_tunnel_dev)
1593                return -EINVAL;
1594
1595        nt = netdev_priv(dev);
1596        ipgre_netlink_parms(data, &p);
1597
1598        t = ipgre_tunnel_locate(net, &p, 0);
1599
1600        if (t) {
1601                if (t->dev != dev)
1602                        return -EEXIST;
1603        } else {
1604                t = nt;
1605
1606                if (dev->type != ARPHRD_ETHER) {
1607                        unsigned int nflags = 0;
1608
1609                        if (ipv4_is_multicast(p.iph.daddr))
1610                                nflags = IFF_BROADCAST;
1611                        else if (p.iph.daddr)
1612                                nflags = IFF_POINTOPOINT;
1613
1614                        if ((dev->flags ^ nflags) &
1615                            (IFF_POINTOPOINT | IFF_BROADCAST))
1616                                return -EINVAL;
1617                }
1618
1619                ipgre_tunnel_unlink(ign, t);
1620                t->parms.iph.saddr = p.iph.saddr;
1621                t->parms.iph.daddr = p.iph.daddr;
1622                t->parms.i_key = p.i_key;
1623                if (dev->type != ARPHRD_ETHER) {
1624                        memcpy(dev->dev_addr, &p.iph.saddr, 4);
1625                        memcpy(dev->broadcast, &p.iph.daddr, 4);
1626                }
1627                ipgre_tunnel_link(ign, t);
1628                netdev_state_change(dev);
1629        }
1630
1631        t->parms.o_key = p.o_key;
1632        t->parms.iph.ttl = p.iph.ttl;
1633        t->parms.iph.tos = p.iph.tos;
1634        t->parms.iph.frag_off = p.iph.frag_off;
1635
1636        if (t->parms.link != p.link) {
1637                t->parms.link = p.link;
1638                mtu = ipgre_tunnel_bind_dev(dev);
1639                if (!tb[IFLA_MTU])
1640                        dev->mtu = mtu;
1641                netdev_state_change(dev);
1642        }
1643
1644        return 0;
1645}
1646
1647static size_t ipgre_get_size(const struct net_device *dev)
1648{
1649        return
1650                /* IFLA_GRE_LINK */
1651                nla_total_size(4) +
1652                /* IFLA_GRE_IFLAGS */
1653                nla_total_size(2) +
1654                /* IFLA_GRE_OFLAGS */
1655                nla_total_size(2) +
1656                /* IFLA_GRE_IKEY */
1657                nla_total_size(4) +
1658                /* IFLA_GRE_OKEY */
1659                nla_total_size(4) +
1660                /* IFLA_GRE_LOCAL */
1661                nla_total_size(4) +
1662                /* IFLA_GRE_REMOTE */
1663                nla_total_size(4) +
1664                /* IFLA_GRE_TTL */
1665                nla_total_size(1) +
1666                /* IFLA_GRE_TOS */
1667                nla_total_size(1) +
1668                /* IFLA_GRE_PMTUDISC */
1669                nla_total_size(1) +
1670                0;
1671}
1672
1673static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1674{
1675        struct ip_tunnel *t = netdev_priv(dev);
1676        struct ip_tunnel_parm *p = &t->parms;
1677
1678        if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1679            nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1680            nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1681            nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1682            nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1683            nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1684            nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1685            nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1686            nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1687            nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1688                       !!(p->iph.frag_off & htons(IP_DF))))
1689                goto nla_put_failure;
1690        return 0;
1691
1692nla_put_failure:
1693        return -EMSGSIZE;
1694}
1695
1696static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1697        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1698        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1699        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1700        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1701        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1702        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1703        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1704        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1705        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1706        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1707};
1708
1709static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1710        .kind           = "gre",
1711        .maxtype        = IFLA_GRE_MAX,
1712        .policy         = ipgre_policy,
1713        .priv_size      = sizeof(struct ip_tunnel),
1714        .setup          = ipgre_tunnel_setup,
1715        .validate       = ipgre_tunnel_validate,
1716        .newlink        = ipgre_newlink,
1717        .changelink     = ipgre_changelink,
1718        .get_size       = ipgre_get_size,
1719        .fill_info      = ipgre_fill_info,
1720};
1721
1722static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1723        .kind           = "gretap",
1724        .maxtype        = IFLA_GRE_MAX,
1725        .policy         = ipgre_policy,
1726        .priv_size      = sizeof(struct ip_tunnel),
1727        .setup          = ipgre_tap_setup,
1728        .validate       = ipgre_tap_validate,
1729        .newlink        = ipgre_newlink,
1730        .changelink     = ipgre_changelink,
1731        .get_size       = ipgre_get_size,
1732        .fill_info      = ipgre_fill_info,
1733};
1734
/*
 *      And now the module code and kernel interface.
 */
1738
1739static int __init ipgre_init(void)
1740{
1741        int err;
1742
1743        pr_info("GRE over IPv4 tunneling driver\n");
1744
1745        err = register_pernet_device(&ipgre_net_ops);
1746        if (err < 0)
1747                return err;
1748
1749        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1750        if (err < 0) {
1751                pr_info("%s: can't add protocol\n", __func__);
1752                goto add_proto_failed;
1753        }
1754
1755        err = rtnl_link_register(&ipgre_link_ops);
1756        if (err < 0)
1757                goto rtnl_link_failed;
1758
1759        err = rtnl_link_register(&ipgre_tap_ops);
1760        if (err < 0)
1761                goto tap_ops_failed;
1762
1763out:
1764        return err;
1765
1766tap_ops_failed:
1767        rtnl_link_unregister(&ipgre_link_ops);
1768rtnl_link_failed:
1769        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1770add_proto_failed:
1771        unregister_pernet_device(&ipgre_net_ops);
1772        goto out;
1773}
1774
1775static void __exit ipgre_fini(void)
1776{
1777        rtnl_link_unregister(&ipgre_tap_ops);
1778        rtnl_link_unregister(&ipgre_link_ops);
1779        if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1780                pr_info("%s: can't remove protocol\n", __func__);
1781        unregister_pernet_device(&ipgre_net_ops);
1782}
1783
1784module_init(ipgre_init);
1785module_exit(ipgre_fini);
1786MODULE_LICENSE("GPL");
1787MODULE_ALIAS_RTNL_LINK("gre");
1788MODULE_ALIAS_RTNL_LINK("gretap");
1789MODULE_ALIAS_NETDEV("gre0");
1790