/* linux/net/ipv4/ip_tunnel.c */
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */
  18
  19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  20
  21#include <linux/capability.h>
  22#include <linux/module.h>
  23#include <linux/types.h>
  24#include <linux/kernel.h>
  25#include <linux/slab.h>
  26#include <linux/uaccess.h>
  27#include <linux/skbuff.h>
  28#include <linux/netdevice.h>
  29#include <linux/in.h>
  30#include <linux/tcp.h>
  31#include <linux/udp.h>
  32#include <linux/if_arp.h>
  33#include <linux/mroute.h>
  34#include <linux/init.h>
  35#include <linux/in6.h>
  36#include <linux/inetdevice.h>
  37#include <linux/igmp.h>
  38#include <linux/netfilter_ipv4.h>
  39#include <linux/etherdevice.h>
  40#include <linux/if_ether.h>
  41#include <linux/if_vlan.h>
  42#include <linux/rculist.h>
  43
  44#include <net/sock.h>
  45#include <net/ip.h>
  46#include <net/icmp.h>
  47#include <net/protocol.h>
  48#include <net/ip_tunnels.h>
  49#include <net/arp.h>
  50#include <net/checksum.h>
  51#include <net/dsfield.h>
  52#include <net/inet_ecn.h>
  53#include <net/xfrm.h>
  54#include <net/net_namespace.h>
  55#include <net/netns/generic.h>
  56#include <net/rtnetlink.h>
  57#include <net/udp.h>
  58
  59#if IS_ENABLED(CONFIG_IPV6)
  60#include <net/ipv6.h>
  61#include <net/ip6_fib.h>
  62#include <net/ip6_route.h>
  63#endif
  64
  65static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
  66{
  67        return hash_32((__force u32)key ^ (__force u32)remote,
  68                         IP_TNL_HASH_BITS);
  69}
  70
  71static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
  72                                __be16 flags, __be32 key)
  73{
  74        if (p->i_flags & TUNNEL_KEY) {
  75                if (flags & TUNNEL_KEY)
  76                        return key == p->i_key;
  77                else
  78                        /* key expected, none present */
  79                        return false;
  80        } else
  81                return !(flags & TUNNEL_KEY);
  82}
  83
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels
   will match fallback tunnel.
   Given src, dst and key, find appropriate for input tunnel.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        /* Pass 1: fully specified tunnels (both saddr and daddr match).
         * A match on a different link is only remembered as a candidate.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        /* Pass 2: remote-only tunnels (wildcard local address). */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        /* Passes 3 and 4 use the bucket for a wildcard remote. */
        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        /* Pass 3: local-only tunnels, or tunnels whose configured remote
         * equals our local multicast destination.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (flags & TUNNEL_NO_KEY)
                goto skip_key_lookup;

        /* Pass 4: key-only tunnels (wildcard addresses). */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (t->parms.i_key != key ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

skip_key_lookup:
        if (cand)
                return cand;

        /* A collect_md tunnel accepts anything that got this far. */
        t = rcu_dereference(itn->collect_md_tun);
        if (t)
                return t;

        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
 187
 188static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 189                                    struct ip_tunnel_parm *parms)
 190{
 191        unsigned int h;
 192        __be32 remote;
 193        __be32 i_key = parms->i_key;
 194
 195        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 196                remote = parms->iph.daddr;
 197        else
 198                remote = 0;
 199
 200        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
 201                i_key = 0;
 202
 203        h = ip_tunnel_hash(i_key, remote);
 204        return &itn->tunnels[h];
 205}
 206
 207static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 208{
 209        struct hlist_head *head = ip_bucket(itn, &t->parms);
 210
 211        if (t->collect_md)
 212                rcu_assign_pointer(itn->collect_md_tun, t);
 213        hlist_add_head_rcu(&t->hash_node, head);
 214}
 215
 216static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 217{
 218        if (t->collect_md)
 219                rcu_assign_pointer(itn->collect_md_tun, NULL);
 220        hlist_del_init_rcu(&t->hash_node);
 221}
 222
 223static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 224                                        struct ip_tunnel_parm *parms,
 225                                        int type)
 226{
 227        __be32 remote = parms->iph.daddr;
 228        __be32 local = parms->iph.saddr;
 229        __be32 key = parms->i_key;
 230        int link = parms->link;
 231        struct ip_tunnel *t = NULL;
 232        struct hlist_head *head = ip_bucket(itn, parms);
 233
 234        hlist_for_each_entry_rcu(t, head, hash_node) {
 235                if (local == t->parms.iph.saddr &&
 236                    remote == t->parms.iph.daddr &&
 237                    key == t->parms.i_key &&
 238                    link == t->parms.link &&
 239                    type == t->dev->type)
 240                        break;
 241        }
 242        return t;
 243}
 244
 245static struct net_device *__ip_tunnel_create(struct net *net,
 246                                             const struct rtnl_link_ops *ops,
 247                                             struct ip_tunnel_parm *parms)
 248{
 249        int err;
 250        struct ip_tunnel *tunnel;
 251        struct net_device *dev;
 252        char name[IFNAMSIZ];
 253
 254        if (parms->name[0])
 255                strlcpy(name, parms->name, IFNAMSIZ);
 256        else {
 257                if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
 258                        err = -E2BIG;
 259                        goto failed;
 260                }
 261                strlcpy(name, ops->kind, IFNAMSIZ);
 262                strncat(name, "%d", 2);
 263        }
 264
 265        ASSERT_RTNL();
 266        dev = alloc_netdev(ops->priv_size, name, ops->setup);
 267        if (!dev) {
 268                err = -ENOMEM;
 269                goto failed;
 270        }
 271        dev_net_set(dev, net);
 272
 273        dev->rtnl_link_ops = ops;
 274
 275        tunnel = netdev_priv(dev);
 276        tunnel->parms = *parms;
 277        tunnel->net = net;
 278
 279        err = register_netdevice(dev);
 280        if (err)
 281                goto failed_free;
 282
 283        return dev;
 284
 285failed_free:
 286        free_netdev(dev);
 287failed:
 288        return ERR_PTR(err);
 289}
 290
 291static inline void init_tunnel_flow(struct flowi4 *fl4,
 292                                    int proto,
 293                                    __be32 daddr, __be32 saddr,
 294                                    __be32 key, __u8 tos, int oif)
 295{
 296        memset(fl4, 0, sizeof(*fl4));
 297        fl4->flowi4_oif = oif;
 298        fl4->daddr = daddr;
 299        fl4->saddr = saddr;
 300        fl4->flowi4_tos = tos;
 301        fl4->flowi4_proto = proto;
 302        fl4->fl4_gre_key = key;
 303}
 304
/* Bind the tunnel to an underlying device by routing to its endpoint,
 * then derive needed_headroom and a suggested MTU from that device.
 * Returns the suggested MTU, clamped to at least 68.
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
                                 iph->saddr, tunnel->parms.o_key,
                                 RT_TOS(iph->tos), tunnel->parms.link);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;

                /* Endpoint may have changed: invalidate any cached route. */
                dst_cache_reset(&tunnel->dst_cache);
        }

        /* No route found: fall back to the explicitly bound link. */
        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = tdev->mtu;
        }

        dev->needed_headroom = t_hlen + hlen;
        mtu -= (dev->hard_header_len + t_hlen);

        /* Never drop below the historical IPv4 minimum MTU. */
        if (mtu < 68)
                mtu = 68;

        return mtu;
}
 352
 353static struct ip_tunnel *ip_tunnel_create(struct net *net,
 354                                          struct ip_tunnel_net *itn,
 355                                          struct ip_tunnel_parm *parms)
 356{
 357        struct ip_tunnel *nt, *fbt;
 358        struct net_device *dev;
 359
 360        BUG_ON(!itn->fb_tunnel_dev);
 361        fbt = netdev_priv(itn->fb_tunnel_dev);
 362        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 363        if (IS_ERR(dev))
 364                return ERR_CAST(dev);
 365
 366        dev->mtu = ip_tunnel_bind_dev(dev);
 367
 368        nt = netdev_priv(dev);
 369        ip_tunnel_add(itn, nt);
 370        return nt;
 371}
 372
/* Common receive path for decapsulated tunnel packets.
 *
 * Validates checksum/sequence expectations against the tunnel's
 * configuration, undoes ECN mangling, updates stats and hands the
 * inner packet to GRO.  Always consumes @skb and returns 0; rejected
 * packets are dropped after bumping the relevant error counters.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        /* Packet and tunnel must agree on whether a checksum is used. */
        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        /* Enforce in-order delivery when sequencing is configured. */
        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        /* err > 1 means the inner/outer ECN combination is invalid. */
        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        /* Scrub state when the packet crosses a netns boundary. */
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        /* Attach collected metadata for lightweight (collect_md) mode. */
        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 445
 446static int ip_encap_hlen(struct ip_tunnel_encap *e)
 447{
 448        const struct ip_tunnel_encap_ops *ops;
 449        int hlen = -EINVAL;
 450
 451        if (e->type == TUNNEL_ENCAP_NONE)
 452                return 0;
 453
 454        if (e->type >= MAX_IPTUN_ENCAP_OPS)
 455                return -EINVAL;
 456
 457        rcu_read_lock();
 458        ops = rcu_dereference(iptun_encaps[e->type]);
 459        if (likely(ops && ops->encap_hlen))
 460                hlen = ops->encap_hlen(e);
 461        rcu_read_unlock();
 462
 463        return hlen;
 464}
 465
/* Registry of pluggable outer-encapsulation handlers, indexed by
 * encap type and read under RCU by the hot paths above/below.
 */
const struct ip_tunnel_encap_ops __rcu *
                iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

/* Atomically claim registry slot @num for @ops.
 * Returns 0 on success, -ERANGE for an out-of-range index, and -1
 * when the slot is already occupied.
 */
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
 480
 481int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
 482                            unsigned int num)
 483{
 484        int ret;
 485
 486        if (num >= MAX_IPTUN_ENCAP_OPS)
 487                return -ERANGE;
 488
 489        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
 490                       &iptun_encaps[num],
 491                       ops, NULL) == ops) ? 0 : -1;
 492
 493        synchronize_net();
 494
 495        return ret;
 496}
 497EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
 498
 499int ip_tunnel_encap_setup(struct ip_tunnel *t,
 500                          struct ip_tunnel_encap *ipencap)
 501{
 502        int hlen;
 503
 504        memset(&t->encap, 0, sizeof(t->encap));
 505
 506        hlen = ip_encap_hlen(ipencap);
 507        if (hlen < 0)
 508                return hlen;
 509
 510        t->encap.type = ipencap->type;
 511        t->encap.sport = ipencap->sport;
 512        t->encap.dport = ipencap->dport;
 513        t->encap.flags = ipencap->flags;
 514
 515        t->encap_hlen = hlen;
 516        t->hlen = t->encap_hlen + t->tun_hlen;
 517
 518        return 0;
 519}
 520EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 521
 522int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
 523                    u8 *protocol, struct flowi4 *fl4)
 524{
 525        const struct ip_tunnel_encap_ops *ops;
 526        int ret = -EINVAL;
 527
 528        if (t->encap.type == TUNNEL_ENCAP_NONE)
 529                return 0;
 530
 531        if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
 532                return -EINVAL;
 533
 534        rcu_read_lock();
 535        ops = rcu_dereference(iptun_encaps[t->encap.type]);
 536        if (likely(ops && ops->build_header))
 537                ret = ops->build_header(skb, &t->encap, protocol, fl4);
 538        rcu_read_unlock();
 539
 540        return ret;
 541}
 542EXPORT_SYMBOL(ip_tunnel_encap);
 543
/* Path-MTU handling for the tunnel transmit path.
 *
 * Computes the effective tunnel MTU from the outer route (@rt) when DF
 * is set, propagates it to the inner route, and sends the appropriate
 * "too big" notification (ICMP frag-needed or ICMPv6 PTB) when the
 * packet does not fit.  Returns 0 to continue transmission or -E2BIG
 * when the caller must drop the packet.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df,
                            const struct iphdr *inner_iph)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
        int mtu;

        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel->hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        /* Let the inner route learn the reduced MTU. */
        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

                /* Pin the reduced MTU on host routes (or a fixed unicast
                 * endpoint) so IPv6 remembers it.
                 */
                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}
 593
/* Common transmit path for IPv4-encapsulated tunnels.
 *
 * Resolves the outer destination (including NBMA tunnels with no fixed
 * endpoint), routes the outer packet, enforces PMTU, then hands off to
 * iptunnel_xmit().  Consumes @skb on every path; errors are accounted
 * in dev->stats rather than returned.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *inner_iph;
        struct flowi4 fl4;
        u8     tos, ttl;
        __be16 df;
        struct rtable *rt;              /* Route to the other host */
        unsigned int max_headroom;      /* The extra header space needed */
        __be32 dst;
        bool connected;                 /* fixed endpoint => route cacheable */

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel: derive the outer destination from the
                 * inner packet's routing information.
                 */

                if (skb_dst(skb) == NULL) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (neigh == NULL)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        /* Only v4-compatible v6 addresses carry an
                         * extractable IPv4 destination.
                         */
                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                /* Per-packet destination: never cache this route. */
                connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                /* Low bit set means "inherit TOS from the inner packet". */
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
                         NULL;

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        /* Routing back to ourselves would loop the packet forever. */
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        /* Throttled link-failure feedback after ICMP errors on this tunnel. */
        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                /* TTL 0 means "inherit from the inner packet". */
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off&htons(IP_DF));

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
 758
/* Apply new parameters @p to an existing tunnel @t.
 *
 * The tunnel is removed from the hash table, its addressing/key fields
 * updated, then re-inserted so it lands in the correct bucket.  A link
 * change triggers a rebind and (optionally) an MTU update.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu)
{
        /* Remove first: the new addresses/keys may hash differently. */
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                /* Point-to-point devices mirror the endpoints in their
                 * hardware addresses.
                 */
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link) {
                int mtu;

                t->parms.link = p->link;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        /* Old cached routes no longer match the new parameters. */
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}
 791
/* Legacy ioctl interface for tunnel management (SIOC{GET,ADD,CHG,DEL}TUNNEL).
 *
 * @p is the caller-supplied parameter block, updated in place for GET.
 * Returns 0 on success or a negative errno.  ADD/CHG/DEL require
 * CAP_NET_ADMIN in the tunnel's user namespace.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        BUG_ON(!itn->fb_tunnel_dev);
        switch (cmd) {
        case SIOCGETTUNNEL:
                /* On the fallback device the lookup selects which tunnel
                 * to report; elsewhere report this device's own parms.
                 */
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (t == NULL)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                /* An explicit TTL implies DF on the outer header. */
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_RET(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t != NULL) {
                                /* Params may not be moved onto a device
                                 * other than the one they already match.
                                 */
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                /* The new endpoint type must match the
                                 * device's broadcast/p2p nature.
                                 */
                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (t == NULL)
                                goto done;
                        err = -EPERM;
                        /* The fallback tunnel itself cannot be deleted. */
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 894
 895int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 896{
 897        struct ip_tunnel *tunnel = netdev_priv(dev);
 898        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 899        int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
 900
 901        if (new_mtu < 68)
 902                return -EINVAL;
 903
 904        if (new_mtu > max_mtu) {
 905                if (strict)
 906                        return -EINVAL;
 907
 908                new_mtu = max_mtu;
 909        }
 910
 911        dev->mtu = new_mtu;
 912        return 0;
 913}
 914EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
 915
 916int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 917{
 918        return __ip_tunnel_change_mtu(dev, new_mtu, true);
 919}
 920EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 921
 922static void ip_tunnel_dev_free(struct net_device *dev)
 923{
 924        struct ip_tunnel *tunnel = netdev_priv(dev);
 925
 926        gro_cells_destroy(&tunnel->gro_cells);
 927        dst_cache_destroy(&tunnel->dst_cache);
 928        free_percpu(dev->tstats);
 929        free_netdev(dev);
 930}
 931
 932void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 933{
 934        struct ip_tunnel *tunnel = netdev_priv(dev);
 935        struct ip_tunnel_net *itn;
 936
 937        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 938
 939        if (itn->fb_tunnel_dev != dev) {
 940                ip_tunnel_del(itn, netdev_priv(dev));
 941                unregister_netdevice_queue(dev, head);
 942        }
 943}
 944EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
 945
 946struct net *ip_tunnel_get_link_net(const struct net_device *dev)
 947{
 948        struct ip_tunnel *tunnel = netdev_priv(dev);
 949
 950        return tunnel->net;
 951}
 952EXPORT_SYMBOL(ip_tunnel_get_link_net);
 953
 954int ip_tunnel_get_iflink(const struct net_device *dev)
 955{
 956        struct ip_tunnel *tunnel = netdev_priv(dev);
 957
 958        return tunnel->parms.link;
 959}
 960EXPORT_SYMBOL(ip_tunnel_get_iflink);
 961
 962int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 963                                  struct rtnl_link_ops *ops, char *devname)
 964{
 965        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
 966        struct ip_tunnel_parm parms;
 967        unsigned int i;
 968
 969        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
 970                INIT_HLIST_HEAD(&itn->tunnels[i]);
 971
 972        if (!ops) {
 973                itn->fb_tunnel_dev = NULL;
 974                return 0;
 975        }
 976
 977        memset(&parms, 0, sizeof(parms));
 978        if (devname)
 979                strlcpy(parms.name, devname, IFNAMSIZ);
 980
 981        rtnl_lock();
 982        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
 983        /* FB netdevice is special: we have one, and only one per netns.
 984         * Allowing to move it to another netns is clearly unsafe.
 985         */
 986        if (!IS_ERR(itn->fb_tunnel_dev)) {
 987                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
 988                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
 989                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
 990        }
 991        rtnl_unlock();
 992
 993        if (IS_ERR(itn->fb_tunnel_dev))
 994                return PTR_ERR(itn->fb_tunnel_dev);
 995
 996        return 0;
 997}
 998EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
 999
1000static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1001                              struct rtnl_link_ops *ops)
1002{
1003        struct net *net = dev_net(itn->fb_tunnel_dev);
1004        struct net_device *dev, *aux;
1005        int h;
1006
1007        for_each_netdev_safe(net, dev, aux)
1008                if (dev->rtnl_link_ops == ops)
1009                        unregister_netdevice_queue(dev, head);
1010
1011        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1012                struct ip_tunnel *t;
1013                struct hlist_node *n;
1014                struct hlist_head *thead = &itn->tunnels[h];
1015
1016                hlist_for_each_entry_safe(t, n, thead, hash_node)
1017                        /* If dev is in the same netns, it has already
1018                         * been added to the list by the previous loop.
1019                         */
1020                        if (!net_eq(dev_net(t->dev), net))
1021                                unregister_netdevice_queue(t->dev, head);
1022        }
1023}
1024
1025void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1026{
1027        LIST_HEAD(list);
1028
1029        rtnl_lock();
1030        ip_tunnel_destroy(itn, &list, ops);
1031        unregister_netdevice_many(&list);
1032        rtnl_unlock();
1033}
1034EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1035
1036int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1037                      struct ip_tunnel_parm *p)
1038{
1039        struct ip_tunnel *nt;
1040        struct net *net = dev_net(dev);
1041        struct ip_tunnel_net *itn;
1042        int mtu;
1043        int err;
1044
1045        nt = netdev_priv(dev);
1046        itn = net_generic(net, nt->ip_tnl_net_id);
1047
1048        if (nt->collect_md) {
1049                if (rtnl_dereference(itn->collect_md_tun))
1050                        return -EEXIST;
1051        } else {
1052                if (ip_tunnel_find(itn, p, dev->type))
1053                        return -EEXIST;
1054        }
1055
1056        nt->net = net;
1057        nt->parms = *p;
1058        err = register_netdevice(dev);
1059        if (err)
1060                goto out;
1061
1062        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1063                eth_hw_addr_random(dev);
1064
1065        mtu = ip_tunnel_bind_dev(dev);
1066        if (!tb[IFLA_MTU])
1067                dev->mtu = mtu;
1068
1069        ip_tunnel_add(itn, nt);
1070out:
1071        return err;
1072}
1073EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1074
1075int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1076                         struct ip_tunnel_parm *p)
1077{
1078        struct ip_tunnel *t;
1079        struct ip_tunnel *tunnel = netdev_priv(dev);
1080        struct net *net = tunnel->net;
1081        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1082
1083        if (dev == itn->fb_tunnel_dev)
1084                return -EINVAL;
1085
1086        t = ip_tunnel_find(itn, p, dev->type);
1087
1088        if (t) {
1089                if (t->dev != dev)
1090                        return -EEXIST;
1091        } else {
1092                t = tunnel;
1093
1094                if (dev->type != ARPHRD_ETHER) {
1095                        unsigned int nflags = 0;
1096
1097                        if (ipv4_is_multicast(p->iph.daddr))
1098                                nflags = IFF_BROADCAST;
1099                        else if (p->iph.daddr)
1100                                nflags = IFF_POINTOPOINT;
1101
1102                        if ((dev->flags ^ nflags) &
1103                            (IFF_POINTOPOINT | IFF_BROADCAST))
1104                                return -EINVAL;
1105                }
1106        }
1107
1108        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1109        return 0;
1110}
1111EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1112
1113int ip_tunnel_init(struct net_device *dev)
1114{
1115        struct ip_tunnel *tunnel = netdev_priv(dev);
1116        struct iphdr *iph = &tunnel->parms.iph;
1117        int err;
1118
1119        dev->destructor = ip_tunnel_dev_free;
1120        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1121        if (!dev->tstats)
1122                return -ENOMEM;
1123
1124        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1125        if (err) {
1126                free_percpu(dev->tstats);
1127                return err;
1128        }
1129
1130        err = gro_cells_init(&tunnel->gro_cells, dev);
1131        if (err) {
1132                dst_cache_destroy(&tunnel->dst_cache);
1133                free_percpu(dev->tstats);
1134                return err;
1135        }
1136
1137        tunnel->dev = dev;
1138        tunnel->net = dev_net(dev);
1139        strcpy(tunnel->parms.name, dev->name);
1140        iph->version            = 4;
1141        iph->ihl                = 5;
1142
1143        if (tunnel->collect_md) {
1144                dev->features |= NETIF_F_NETNS_LOCAL;
1145                netif_keep_dst(dev);
1146        }
1147        return 0;
1148}
1149EXPORT_SYMBOL_GPL(ip_tunnel_init);
1150
1151void ip_tunnel_uninit(struct net_device *dev)
1152{
1153        struct ip_tunnel *tunnel = netdev_priv(dev);
1154        struct net *net = tunnel->net;
1155        struct ip_tunnel_net *itn;
1156
1157        itn = net_generic(net, tunnel->ip_tnl_net_id);
1158        /* fb_tunnel_dev will be unregisted in net-exit call. */
1159        if (itn->fb_tunnel_dev != dev)
1160                ip_tunnel_del(itn, netdev_priv(dev));
1161
1162        dst_cache_reset(&tunnel->dst_cache);
1163}
1164EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1165
1166/* Do least required initialization, rest of init is done in tunnel_init call */
1167void ip_tunnel_setup(struct net_device *dev, int net_id)
1168{
1169        struct ip_tunnel *tunnel = netdev_priv(dev);
1170        tunnel->ip_tnl_net_id = net_id;
1171}
1172EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1173
1174MODULE_LICENSE("GPL");
1175