linux/net/ipv4/ip_tunnel.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2013 Nicira, Inc.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of version 2 of the GNU General Public
   6 * License as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful, but
   9 * WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * along with this program; if not, write to the Free Software
  15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16 * 02110-1301, USA
  17 */
  18
  19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  20
  21#include <linux/capability.h>
  22#include <linux/module.h>
  23#include <linux/types.h>
  24#include <linux/kernel.h>
  25#include <linux/slab.h>
  26#include <linux/uaccess.h>
  27#include <linux/skbuff.h>
  28#include <linux/netdevice.h>
  29#include <linux/in.h>
  30#include <linux/tcp.h>
  31#include <linux/udp.h>
  32#include <linux/if_arp.h>
  33#include <linux/mroute.h>
  34#include <linux/init.h>
  35#include <linux/in6.h>
  36#include <linux/inetdevice.h>
  37#include <linux/igmp.h>
  38#include <linux/netfilter_ipv4.h>
  39#include <linux/etherdevice.h>
  40#include <linux/if_ether.h>
  41#include <linux/if_vlan.h>
  42#include <linux/rculist.h>
  43#include <linux/err.h>
  44
  45#include <net/sock.h>
  46#include <net/ip.h>
  47#include <net/icmp.h>
  48#include <net/protocol.h>
  49#include <net/ip_tunnels.h>
  50#include <net/arp.h>
  51#include <net/checksum.h>
  52#include <net/dsfield.h>
  53#include <net/inet_ecn.h>
  54#include <net/xfrm.h>
  55#include <net/net_namespace.h>
  56#include <net/netns/generic.h>
  57#include <net/rtnetlink.h>
  58#include <net/udp.h>
  59
  60#if IS_ENABLED(CONFIG_IPV6)
  61#include <net/ipv6.h>
  62#include <net/ip6_fib.h>
  63#include <net/ip6_route.h>
  64#endif
  65
  66static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
  67{
  68        return hash_32((__force u32)key ^ (__force u32)remote,
  69                         IP_TNL_HASH_BITS);
  70}
  71
  72static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
  73                             struct dst_entry *dst, __be32 saddr)
  74{
  75        struct dst_entry *old_dst;
  76
  77        dst_clone(dst);
  78        old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
  79        dst_release(old_dst);
  80        idst->saddr = saddr;
  81}
  82
  83static noinline void tunnel_dst_set(struct ip_tunnel *t,
  84                           struct dst_entry *dst, __be32 saddr)
  85{
  86        __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
  87}
  88
/* Clear the dst cache slot used by tunnel_dst_set() (NULL dst, zero saddr). */
static void tunnel_dst_reset(struct ip_tunnel *t)
{
        tunnel_dst_set(t, NULL, 0);
}
  93
  94void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
  95{
  96        int i;
  97
  98        for_each_possible_cpu(i)
  99                __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
 100}
 101EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
 102
 103static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
 104                                        u32 cookie, __be32 *saddr)
 105{
 106        struct ip_tunnel_dst *idst;
 107        struct dst_entry *dst;
 108
 109        rcu_read_lock();
 110        idst = raw_cpu_ptr(t->dst_cache);
 111        dst = rcu_dereference(idst->dst);
 112        if (dst && !atomic_inc_not_zero(&dst->__refcnt))
 113                dst = NULL;
 114        if (dst) {
 115                if (!dst->obsolete || dst->ops->check(dst, cookie)) {
 116                        *saddr = idst->saddr;
 117                } else {
 118                        tunnel_dst_reset(t);
 119                        dst_release(dst);
 120                        dst = NULL;
 121                }
 122        }
 123        rcu_read_unlock();
 124        return (struct rtable *)dst;
 125}
 126
 127static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
 128                                __be16 flags, __be32 key)
 129{
 130        if (p->i_flags & TUNNEL_KEY) {
 131                if (flags & TUNNEL_KEY)
 132                        return key == p->i_key;
 133                else
 134                        /* key expected, none present */
 135                        return false;
 136        } else
 137                return !(flags & TUNNEL_KEY);
 138}
 139
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.

   Lookup order (first tunnel whose parms.link equals @link wins; otherwise
   the first candidate found is remembered in 'cand'):
     1. local and remote both match the tunnel's saddr/daddr
     2. remote matches daddr and the tunnel has no saddr configured
     3. in the bucket hashed with remote == 0: local matches saddr with no
        daddr configured, or local is multicast and matches daddr
     4. unless TUNNEL_NO_KEY is set in @flags: tunnels with neither saddr
        nor daddr whose i_key equals @key
   Only devices that are IFF_UP are considered.  If nothing matched, the
   fallback device is returned when it exists and is up, else NULL.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        /* Pass 1: fully specified tunnels (local and remote both match). */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        /* Pass 2: remote matches, tunnel has a wildcard (zero) source. */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        /* Remaining passes use the bucket for tunnels hashed with remote 0. */
        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        /* Pass 3: local-only tunnels, or multicast destination tunnels. */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (flags & TUNNEL_NO_KEY)
                goto skip_key_lookup;

        /* Pass 4: tunnels with no addresses at all, matched on i_key only. */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (t->parms.i_key != key ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

skip_key_lookup:
        /* No tunnel on the requested link: settle for the best candidate. */
        if (cand)
                return cand;

        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);


        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
 240
 241static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 242                                    struct ip_tunnel_parm *parms)
 243{
 244        unsigned int h;
 245        __be32 remote;
 246        __be32 i_key = parms->i_key;
 247
 248        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 249                remote = parms->iph.daddr;
 250        else
 251                remote = 0;
 252
 253        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
 254                i_key = 0;
 255
 256        h = ip_tunnel_hash(i_key, remote);
 257        return &itn->tunnels[h];
 258}
 259
 260static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 261{
 262        struct hlist_head *head = ip_bucket(itn, &t->parms);
 263
 264        hlist_add_head_rcu(&t->hash_node, head);
 265}
 266
/* Unlink @t from its hash bucket and reinitialise its hash node. */
static void ip_tunnel_del(struct ip_tunnel *t)
{
        hlist_del_init_rcu(&t->hash_node);
}
 271
 272static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 273                                        struct ip_tunnel_parm *parms,
 274                                        int type)
 275{
 276        __be32 remote = parms->iph.daddr;
 277        __be32 local = parms->iph.saddr;
 278        __be32 key = parms->i_key;
 279        __be16 flags = parms->i_flags;
 280        int link = parms->link;
 281        struct ip_tunnel *t = NULL;
 282        struct hlist_head *head = ip_bucket(itn, parms);
 283
 284        hlist_for_each_entry_rcu(t, head, hash_node) {
 285                if (local == t->parms.iph.saddr &&
 286                    remote == t->parms.iph.daddr &&
 287                    link == t->parms.link &&
 288                    type == t->dev->type &&
 289                    ip_tunnel_key_match(&t->parms, flags, key))
 290                        break;
 291        }
 292        return t;
 293}
 294
 295static struct net_device *__ip_tunnel_create(struct net *net,
 296                                             const struct rtnl_link_ops *ops,
 297                                             struct ip_tunnel_parm *parms)
 298{
 299        int err;
 300        struct ip_tunnel *tunnel;
 301        struct net_device *dev;
 302        char name[IFNAMSIZ];
 303
 304        if (parms->name[0])
 305                strlcpy(name, parms->name, IFNAMSIZ);
 306        else {
 307                if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
 308                        err = -E2BIG;
 309                        goto failed;
 310                }
 311                strlcpy(name, ops->kind, IFNAMSIZ);
 312                strncat(name, "%d", 2);
 313        }
 314
 315        ASSERT_RTNL();
 316        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
 317        if (!dev) {
 318                err = -ENOMEM;
 319                goto failed;
 320        }
 321        dev_net_set(dev, net);
 322
 323        dev->rtnl_link_ops = ops;
 324
 325        tunnel = netdev_priv(dev);
 326        tunnel->parms = *parms;
 327        tunnel->net = net;
 328
 329        err = register_netdevice(dev);
 330        if (err)
 331                goto failed_free;
 332
 333        return dev;
 334
 335failed_free:
 336        free_netdev(dev);
 337failed:
 338        return ERR_PTR(err);
 339}
 340
 341static inline void init_tunnel_flow(struct flowi4 *fl4,
 342                                    int proto,
 343                                    __be32 daddr, __be32 saddr,
 344                                    __be32 key, __u8 tos, int oif)
 345{
 346        memset(fl4, 0, sizeof(*fl4));
 347        fl4->flowi4_oif = oif;
 348        fl4->daddr = daddr;
 349        fl4->saddr = saddr;
 350        fl4->flowi4_tos = tos;
 351        fl4->flowi4_proto = proto;
 352        fl4->fl4_gre_key = key;
 353}
 354
 355static int ip_tunnel_bind_dev(struct net_device *dev)
 356{
 357        struct net_device *tdev = NULL;
 358        struct ip_tunnel *tunnel = netdev_priv(dev);
 359        const struct iphdr *iph;
 360        int hlen = LL_MAX_HEADER;
 361        int mtu = ETH_DATA_LEN;
 362        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 363
 364        iph = &tunnel->parms.iph;
 365
 366        /* Guess output device to choose reasonable mtu and needed_headroom */
 367        if (iph->daddr) {
 368                struct flowi4 fl4;
 369                struct rtable *rt;
 370
 371                init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
 372                                 iph->saddr, tunnel->parms.o_key,
 373                                 RT_TOS(iph->tos), tunnel->parms.link);
 374                rt = ip_route_output_key(tunnel->net, &fl4);
 375
 376                if (!IS_ERR(rt)) {
 377                        tdev = rt->dst.dev;
 378                        tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
 379                        ip_rt_put(rt);
 380                }
 381                if (dev->type != ARPHRD_ETHER)
 382                        dev->flags |= IFF_POINTOPOINT;
 383        }
 384
 385        if (!tdev && tunnel->parms.link)
 386                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 387
 388        if (tdev) {
 389                hlen = tdev->hard_header_len + tdev->needed_headroom;
 390                mtu = tdev->mtu;
 391        }
 392
 393        dev->needed_headroom = t_hlen + hlen;
 394        mtu -= (dev->hard_header_len + t_hlen);
 395
 396        if (mtu < 68)
 397                mtu = 68;
 398
 399        return mtu;
 400}
 401
 402static struct ip_tunnel *ip_tunnel_create(struct net *net,
 403                                          struct ip_tunnel_net *itn,
 404                                          struct ip_tunnel_parm *parms)
 405{
 406        struct ip_tunnel *nt;
 407        struct net_device *dev;
 408
 409        BUG_ON(!itn->fb_tunnel_dev);
 410        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 411        if (IS_ERR(dev))
 412                return ERR_CAST(dev);
 413
 414        dev->mtu = ip_tunnel_bind_dev(dev);
 415
 416        nt = netdev_priv(dev);
 417        ip_tunnel_add(itn, nt);
 418        return nt;
 419}
 420
/*
 * Common receive path for a decapsulated tunnel packet.
 *
 * @tunnel:        tunnel the packet was matched to
 * @skb:           the packet; consumed in all cases
 * @tpi:           parsed tunnel header information (flags, sequence number)
 * @log_ecn_error: whether to log ECN decapsulation problems (rate limited)
 *
 * The packet is dropped (with the corresponding error counters bumped)
 * when its checksum flag disagrees with the tunnel configuration, when a
 * required sequence number is missing or older than expected, or when
 * IP_ECN_decapsulate() returns a value greater than 1.  Otherwise the
 * per-cpu rx statistics are updated and the skb is handed to the tunnel's
 * GRO cells.
 *
 * Always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        /* TUNNEL_CSUM must be set on both the packet and the tunnel, or on
         * neither of them.
         */
        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        /* Sequencing enabled: require a sequence number that is not behind
         * the next expected one (signed difference handles wrap-around).
         */
        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        /* iph still refers to the outer header captured at function entry. */
        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                /* Only results above 1 are fatal; others are just logged. */
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        /* Second argument is true when the tunnel's net and the device's
         * net differ.
         */
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 489
 490static int ip_encap_hlen(struct ip_tunnel_encap *e)
 491{
 492        const struct ip_tunnel_encap_ops *ops;
 493        int hlen = -EINVAL;
 494
 495        if (e->type == TUNNEL_ENCAP_NONE)
 496                return 0;
 497
 498        if (e->type >= MAX_IPTUN_ENCAP_OPS)
 499                return -EINVAL;
 500
 501        rcu_read_lock();
 502        ops = rcu_dereference(iptun_encaps[e->type]);
 503        if (likely(ops && ops->encap_hlen))
 504                hlen = ops->encap_hlen(e);
 505        rcu_read_unlock();
 506
 507        return hlen;
 508}
 509
/* Table of registered encapsulation ops, indexed by encapsulation type.
 * Entries are read under RCU and updated with cmpxchg() by
 * ip_tunnel_encap_add_ops() / ip_tunnel_encap_del_ops().
 */
const struct ip_tunnel_encap_ops __rcu *
                iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
 512
 513int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
 514                            unsigned int num)
 515{
 516        if (num >= MAX_IPTUN_ENCAP_OPS)
 517                return -ERANGE;
 518
 519        return !cmpxchg((const struct ip_tunnel_encap_ops **)
 520                        &iptun_encaps[num],
 521                        NULL, ops) ? 0 : -1;
 522}
 523EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
 524
 525int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
 526                            unsigned int num)
 527{
 528        int ret;
 529
 530        if (num >= MAX_IPTUN_ENCAP_OPS)
 531                return -ERANGE;
 532
 533        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
 534                       &iptun_encaps[num],
 535                       ops, NULL) == ops) ? 0 : -1;
 536
 537        synchronize_net();
 538
 539        return ret;
 540}
 541EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
 542
 543int ip_tunnel_encap_setup(struct ip_tunnel *t,
 544                          struct ip_tunnel_encap *ipencap)
 545{
 546        int hlen;
 547
 548        memset(&t->encap, 0, sizeof(t->encap));
 549
 550        hlen = ip_encap_hlen(ipencap);
 551        if (hlen < 0)
 552                return hlen;
 553
 554        t->encap.type = ipencap->type;
 555        t->encap.sport = ipencap->sport;
 556        t->encap.dport = ipencap->dport;
 557        t->encap.flags = ipencap->flags;
 558
 559        t->encap_hlen = hlen;
 560        t->hlen = t->encap_hlen + t->tun_hlen;
 561
 562        return 0;
 563}
 564EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 565
 566int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
 567                    u8 *protocol, struct flowi4 *fl4)
 568{
 569        const struct ip_tunnel_encap_ops *ops;
 570        int ret = -EINVAL;
 571
 572        if (t->encap.type == TUNNEL_ENCAP_NONE)
 573                return 0;
 574
 575        if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
 576                return -EINVAL;
 577
 578        rcu_read_lock();
 579        ops = rcu_dereference(iptun_encaps[t->encap.type]);
 580        if (likely(ops && ops->build_header))
 581                ret = ops->build_header(skb, &t->encap, protocol, fl4);
 582        rcu_read_unlock();
 583
 584        return ret;
 585}
 586EXPORT_SYMBOL(ip_tunnel_encap);
 587
/*
 * Propagate the tunnel path MTU to the inner packet's route and decide
 * whether @skb is too big to be sent.
 *
 * @dev: tunnel device
 * @skb: packet about to be transmitted
 * @rt:  outer route chosen for the packet
 * @df:  outer frag_off setting from the tunnel parameters
 *
 * Returns 0 when transmission may proceed, or -E2BIG after an ICMP
 * "fragmentation needed" / ICMPv6 "packet too big" error has been sent
 * for the packet.  The skb is not freed here.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
        int mtu;

        /* With a non-zero df the usable MTU is the outer route MTU minus
         * all tunnel overhead; otherwise use the MTU of the skb's dst, or
         * the device MTU when the skb has no dst.
         */
        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel->hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                /* Non-GSO packet with DF set that does not fit. */
                if (!skb_is_gso(skb) &&
                    (df & htons(IP_DF)) && mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

                /* Lower the MTU metric of the inner IPv6 route, but only for
                 * tunnels with a unicast destination or for host (/128)
                 * routes.
                 */
                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}
 635
/*
 * Common transmit path: encapsulate @skb and send it over tunnel @dev.
 *
 * @skb:        packet to transmit; consumed in all cases
 * @dev:        tunnel device
 * @tnl_params: outer IP header template (daddr, saddr, tos, ttl, frag_off)
 * @protocol:   outer IP protocol number; may be rewritten by the
 *              encapsulation's build_header callback
 *
 * Steps: determine the outer destination (from the template, or for NBMA
 * tunnels from the inner packet's route/neighbour), pick the TOS, build
 * any extra encapsulation header, find a route (using the tunnel's dst
 * cache only while the tunnel is "connected"), run the PMTU check, derive
 * TTL and DF, make sure there is enough headroom and finally hand the
 * packet to iptunnel_xmit().  Errors bump the relevant device counters and
 * free the skb.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *inner_iph;
        struct flowi4 fl4;
        u8     tos, ttl;
        __be16 df;
        struct rtable *rt;              /* Route to the other host */
        unsigned int max_headroom;      /* The extra header space needed */
        __be32 dst;
        int err;
        bool connected;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        /* "connected" gates use of the per-tunnel dst cache below. */
        connected = (tunnel->parms.iph.daddr != 0);

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                if (skb->protocol == htons(ETH_P_IP)) {
                        /* Outer destination is the inner route's next hop. */
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        /* Unspecified neighbour key: use the packet's own
                         * destination address instead.
                         */
                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        /* Only IPv4-compatible addresses yield an outer
                         * destination (taken from the last 32 bits).
                         */
                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                connected = false;
        }

        /* Low bit set in the template TOS: take the TOS from the inner
         * packet.  The route then depends on the packet, so the dst cache
         * is not used.
         */
        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (connected)
                        tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
        }

        /* The route points back at the tunnel device itself. */
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        /* While errors are pending and recent (within IPTUNNEL_ERR_TIMEO of
         * err_time), report a link failure for this packet and consume one
         * pending error; once the window has passed, clear the count.
         */
        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        /* A zero template TTL means: inherit from the inner header, or use
         * the route's hop limit for other protocols.
         */
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        /* Outer DF: template setting, plus the inner IPv4 DF bit if set. */
        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP))
                df |= (inner_iph->frag_off&htons(IP_DF));

        /* Grow the device's needed_headroom if this route needs more. */
        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
                            tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
        iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
 800
/*
 * Apply new parameters @p to the existing tunnel @t on device @dev.
 *
 * The tunnel is unhashed before the fields that determine its bucket
 * (addresses and keys) are changed, and rehashed afterwards.  When the
 * bound link changes, the underlying device is re-evaluated and, if
 * @set_mtu is true, the device MTU is updated.  All cached routes are
 * dropped and a device state change notification is sent.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu)
{
        ip_tunnel_del(t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        /* Non-Ethernet tunnel devices mirror the endpoint addresses in their
         * hardware and broadcast addresses.
         */
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link) {
                int mtu;

                t->parms.link = p->link;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        ip_tunnel_dst_reset_all(t);
        netdev_state_change(dev);
}
 833
/*
 * Handle the tunnel ioctls SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL
 * and SIOCDELTUNNEL for @dev.
 *
 * @dev: device the ioctl was issued on
 * @p:   tunnel parameters; input for all commands, and overwritten with
 *       the tunnel's parameters by SIOCGETTUNNEL
 * @cmd: ioctl command
 *
 * Add, change and delete require CAP_NET_ADMIN in the user namespace of
 * the tunnel's net.
 *
 * Returns 0 on success or a negative errno: -EPERM (missing capability, or
 * an attempt to delete the fallback tunnel), -EEXIST, -ENOENT, -EINVAL, or
 * an error from tunnel creation.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        BUG_ON(!itn->fb_tunnel_dev);
        switch (cmd) {
        case SIOCGETTUNNEL:
                /* On the fallback device, look up the tunnel described by
                 * @p; if there is none, report the fallback tunnel itself.
                 */
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                /* A fixed TTL forces DF on the outer header. */
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                /* Except for VTI, keys are meaningless without TUNNEL_KEY. */
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                /* The new parameters already belong to a
                                 * different tunnel device.
                                 */
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                /* The change must not alter the device's
                                 * point-to-point / broadcast nature.
                                 */
                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                /* Issued on the fallback device: delete the tunnel described
                 * by @p, but never the fallback tunnel itself.
                 */
                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 936
 937int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 938{
 939        struct ip_tunnel *tunnel = netdev_priv(dev);
 940        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 941
 942        if (new_mtu < 68 ||
 943            new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
 944                return -EINVAL;
 945        dev->mtu = new_mtu;
 946        return 0;
 947}
 948EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 949
 950static void ip_tunnel_dev_free(struct net_device *dev)
 951{
 952        struct ip_tunnel *tunnel = netdev_priv(dev);
 953
 954        gro_cells_destroy(&tunnel->gro_cells);
 955        free_percpu(tunnel->dst_cache);
 956        free_percpu(dev->tstats);
 957        free_netdev(dev);
 958}
 959
 960void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 961{
 962        struct ip_tunnel *tunnel = netdev_priv(dev);
 963        struct ip_tunnel_net *itn;
 964
 965        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 966
 967        if (itn->fb_tunnel_dev != dev) {
 968                ip_tunnel_del(netdev_priv(dev));
 969                unregister_netdevice_queue(dev, head);
 970        }
 971}
 972EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
 973
 974struct net *ip_tunnel_get_link_net(const struct net_device *dev)
 975{
 976        struct ip_tunnel *tunnel = netdev_priv(dev);
 977
 978        return tunnel->net;
 979}
 980EXPORT_SYMBOL(ip_tunnel_get_link_net);
 981
 982int ip_tunnel_get_iflink(const struct net_device *dev)
 983{
 984        struct ip_tunnel *tunnel = netdev_priv(dev);
 985
 986        return tunnel->parms.link;
 987}
 988EXPORT_SYMBOL(ip_tunnel_get_iflink);
 989
 990int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 991                                  struct rtnl_link_ops *ops, char *devname)
 992{
 993        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
 994        struct ip_tunnel_parm parms;
 995        unsigned int i;
 996
 997        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
 998                INIT_HLIST_HEAD(&itn->tunnels[i]);
 999
1000        if (!ops) {
1001                itn->fb_tunnel_dev = NULL;
1002                return 0;
1003        }
1004
1005        memset(&parms, 0, sizeof(parms));
1006        if (devname)
1007                strlcpy(parms.name, devname, IFNAMSIZ);
1008
1009        rtnl_lock();
1010        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1011        /* FB netdevice is special: we have one, and only one per netns.
1012         * Allowing to move it to another netns is clearly unsafe.
1013         */
1014        if (!IS_ERR(itn->fb_tunnel_dev)) {
1015                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1016                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1017                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1018        }
1019        rtnl_unlock();
1020
1021        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1022}
1023EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1024
1025static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1026                              struct rtnl_link_ops *ops)
1027{
1028        struct net *net = dev_net(itn->fb_tunnel_dev);
1029        struct net_device *dev, *aux;
1030        int h;
1031
1032        for_each_netdev_safe(net, dev, aux)
1033                if (dev->rtnl_link_ops == ops)
1034                        unregister_netdevice_queue(dev, head);
1035
1036        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1037                struct ip_tunnel *t;
1038                struct hlist_node *n;
1039                struct hlist_head *thead = &itn->tunnels[h];
1040
1041                hlist_for_each_entry_safe(t, n, thead, hash_node)
1042                        /* If dev is in the same netns, it has already
1043                         * been added to the list by the previous loop.
1044                         */
1045                        if (!net_eq(dev_net(t->dev), net))
1046                                unregister_netdevice_queue(t->dev, head);
1047        }
1048}
1049
1050void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1051{
1052        LIST_HEAD(list);
1053
1054        rtnl_lock();
1055        ip_tunnel_destroy(itn, &list, ops);
1056        unregister_netdevice_many(&list);
1057        rtnl_unlock();
1058}
1059EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1060
1061int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1062                      struct ip_tunnel_parm *p)
1063{
1064        struct ip_tunnel *nt;
1065        struct net *net = dev_net(dev);
1066        struct ip_tunnel_net *itn;
1067        int mtu;
1068        int err;
1069
1070        nt = netdev_priv(dev);
1071        itn = net_generic(net, nt->ip_tnl_net_id);
1072
1073        if (ip_tunnel_find(itn, p, dev->type))
1074                return -EEXIST;
1075
1076        nt->net = net;
1077        nt->parms = *p;
1078        err = register_netdevice(dev);
1079        if (err)
1080                goto out;
1081
1082        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1083                eth_hw_addr_random(dev);
1084
1085        mtu = ip_tunnel_bind_dev(dev);
1086        if (!tb[IFLA_MTU])
1087                dev->mtu = mtu;
1088
1089        ip_tunnel_add(itn, nt);
1090
1091out:
1092        return err;
1093}
1094EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1095
1096int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1097                         struct ip_tunnel_parm *p)
1098{
1099        struct ip_tunnel *t;
1100        struct ip_tunnel *tunnel = netdev_priv(dev);
1101        struct net *net = tunnel->net;
1102        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1103
1104        if (dev == itn->fb_tunnel_dev)
1105                return -EINVAL;
1106
1107        t = ip_tunnel_find(itn, p, dev->type);
1108
1109        if (t) {
1110                if (t->dev != dev)
1111                        return -EEXIST;
1112        } else {
1113                t = tunnel;
1114
1115                if (dev->type != ARPHRD_ETHER) {
1116                        unsigned int nflags = 0;
1117
1118                        if (ipv4_is_multicast(p->iph.daddr))
1119                                nflags = IFF_BROADCAST;
1120                        else if (p->iph.daddr)
1121                                nflags = IFF_POINTOPOINT;
1122
1123                        if ((dev->flags ^ nflags) &
1124                            (IFF_POINTOPOINT | IFF_BROADCAST))
1125                                return -EINVAL;
1126                }
1127        }
1128
1129        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1130        return 0;
1131}
1132EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1133
1134int ip_tunnel_init(struct net_device *dev)
1135{
1136        struct ip_tunnel *tunnel = netdev_priv(dev);
1137        struct iphdr *iph = &tunnel->parms.iph;
1138        int err;
1139
1140        dev->destructor = ip_tunnel_dev_free;
1141        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1142        if (!dev->tstats)
1143                return -ENOMEM;
1144
1145        tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1146        if (!tunnel->dst_cache) {
1147                free_percpu(dev->tstats);
1148                return -ENOMEM;
1149        }
1150
1151        err = gro_cells_init(&tunnel->gro_cells, dev);
1152        if (err) {
1153                free_percpu(tunnel->dst_cache);
1154                free_percpu(dev->tstats);
1155                return err;
1156        }
1157
1158        tunnel->dev = dev;
1159        tunnel->net = dev_net(dev);
1160        strcpy(tunnel->parms.name, dev->name);
1161        iph->version            = 4;
1162        iph->ihl                = 5;
1163
1164        return 0;
1165}
1166EXPORT_SYMBOL_GPL(ip_tunnel_init);
1167
1168void ip_tunnel_uninit(struct net_device *dev)
1169{
1170        struct ip_tunnel *tunnel = netdev_priv(dev);
1171        struct net *net = tunnel->net;
1172        struct ip_tunnel_net *itn;
1173
1174        itn = net_generic(net, tunnel->ip_tnl_net_id);
1175        /* fb_tunnel_dev will be unregisted in net-exit call. */
1176        if (itn->fb_tunnel_dev != dev)
1177                ip_tunnel_del(netdev_priv(dev));
1178
1179        ip_tunnel_dst_reset_all(tunnel);
1180}
1181EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1182
1183/* Do least required initialization, rest of init is done in tunnel_init call */
1184void ip_tunnel_setup(struct net_device *dev, int net_id)
1185{
1186        struct ip_tunnel *tunnel = netdev_priv(dev);
1187        tunnel->ip_tnl_net_id = net_id;
1188}
1189EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1190
1191MODULE_LICENSE("GPL");
1192