/* linux/net/ipv4/ip_tunnel.c */
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */
  18
  19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  20
  21#include <linux/capability.h>
  22#include <linux/module.h>
  23#include <linux/types.h>
  24#include <linux/kernel.h>
  25#include <linux/slab.h>
  26#include <linux/uaccess.h>
  27#include <linux/skbuff.h>
  28#include <linux/netdevice.h>
  29#include <linux/in.h>
  30#include <linux/tcp.h>
  31#include <linux/udp.h>
  32#include <linux/if_arp.h>
  33#include <linux/init.h>
  34#include <linux/in6.h>
  35#include <linux/inetdevice.h>
  36#include <linux/igmp.h>
  37#include <linux/netfilter_ipv4.h>
  38#include <linux/etherdevice.h>
  39#include <linux/if_ether.h>
  40#include <linux/if_vlan.h>
  41#include <linux/rculist.h>
  42
  43#include <net/sock.h>
  44#include <net/ip.h>
  45#include <net/icmp.h>
  46#include <net/protocol.h>
  47#include <net/ip_tunnels.h>
  48#include <net/arp.h>
  49#include <net/checksum.h>
  50#include <net/dsfield.h>
  51#include <net/inet_ecn.h>
  52#include <net/xfrm.h>
  53#include <net/net_namespace.h>
  54#include <net/netns/generic.h>
  55#include <net/rtnetlink.h>
  56#include <net/udp.h>
  57#include <net/dst_metadata.h>
  58
  59#if IS_ENABLED(CONFIG_IPV6)
  60#include <net/ipv6.h>
  61#include <net/ip6_fib.h>
  62#include <net/ip6_route.h>
  63#endif
  64
  65static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
  66{
  67        return hash_32((__force u32)key ^ (__force u32)remote,
  68                         IP_TNL_HASH_BITS);
  69}
  70
  71static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
  72                                __be16 flags, __be32 key)
  73{
  74        if (p->i_flags & TUNNEL_KEY) {
  75                if (flags & TUNNEL_KEY)
  76                        return key == p->i_key;
  77                else
  78                        /* key expected, none present */
  79                        return false;
  80        } else
  81                return !(flags & TUNNEL_KEY);
  82}
  83
  84/* Fallback tunnel: no source, no destination, no key, no options
  85
  86   Tunnel hash table:
  87   We require exact key match i.e. if a key is present in packet
  88   it will match only tunnel with the same key; if it is not present,
  89   it will match only keyless tunnel.
  90
  91   All keysless packets, if not matched configured keyless tunnels
  92   will match fallback tunnel.
  93   Given src, dst and key, find appropriate for input tunnel.
  94*/
  95struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
  96                                   int link, __be16 flags,
  97                                   __be32 remote, __be32 local,
  98                                   __be32 key)
  99{
 100        unsigned int hash;
 101        struct ip_tunnel *t, *cand = NULL;
 102        struct hlist_head *head;
 103
 104        hash = ip_tunnel_hash(key, remote);
 105        head = &itn->tunnels[hash];
 106
 107        hlist_for_each_entry_rcu(t, head, hash_node) {
 108                if (local != t->parms.iph.saddr ||
 109                    remote != t->parms.iph.daddr ||
 110                    !(t->dev->flags & IFF_UP))
 111                        continue;
 112
 113                if (!ip_tunnel_key_match(&t->parms, flags, key))
 114                        continue;
 115
 116                if (t->parms.link == link)
 117                        return t;
 118                else
 119                        cand = t;
 120        }
 121
 122        hlist_for_each_entry_rcu(t, head, hash_node) {
 123                if (remote != t->parms.iph.daddr ||
 124                    t->parms.iph.saddr != 0 ||
 125                    !(t->dev->flags & IFF_UP))
 126                        continue;
 127
 128                if (!ip_tunnel_key_match(&t->parms, flags, key))
 129                        continue;
 130
 131                if (t->parms.link == link)
 132                        return t;
 133                else if (!cand)
 134                        cand = t;
 135        }
 136
 137        hash = ip_tunnel_hash(key, 0);
 138        head = &itn->tunnels[hash];
 139
 140        hlist_for_each_entry_rcu(t, head, hash_node) {
 141                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
 142                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
 143                        continue;
 144
 145                if (!(t->dev->flags & IFF_UP))
 146                        continue;
 147
 148                if (!ip_tunnel_key_match(&t->parms, flags, key))
 149                        continue;
 150
 151                if (t->parms.link == link)
 152                        return t;
 153                else if (!cand)
 154                        cand = t;
 155        }
 156
 157        if (flags & TUNNEL_NO_KEY)
 158                goto skip_key_lookup;
 159
 160        hlist_for_each_entry_rcu(t, head, hash_node) {
 161                if (t->parms.i_key != key ||
 162                    t->parms.iph.saddr != 0 ||
 163                    t->parms.iph.daddr != 0 ||
 164                    !(t->dev->flags & IFF_UP))
 165                        continue;
 166
 167                if (t->parms.link == link)
 168                        return t;
 169                else if (!cand)
 170                        cand = t;
 171        }
 172
 173skip_key_lookup:
 174        if (cand)
 175                return cand;
 176
 177        t = rcu_dereference(itn->collect_md_tun);
 178        if (t && t->dev->flags & IFF_UP)
 179                return t;
 180
 181        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
 182                return netdev_priv(itn->fb_tunnel_dev);
 183
 184        return NULL;
 185}
 186EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
 187
 188static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 189                                    struct ip_tunnel_parm *parms)
 190{
 191        unsigned int h;
 192        __be32 remote;
 193        __be32 i_key = parms->i_key;
 194
 195        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 196                remote = parms->iph.daddr;
 197        else
 198                remote = 0;
 199
 200        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
 201                i_key = 0;
 202
 203        h = ip_tunnel_hash(i_key, remote);
 204        return &itn->tunnels[h];
 205}
 206
 207static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 208{
 209        struct hlist_head *head = ip_bucket(itn, &t->parms);
 210
 211        if (t->collect_md)
 212                rcu_assign_pointer(itn->collect_md_tun, t);
 213        hlist_add_head_rcu(&t->hash_node, head);
 214}
 215
 216static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 217{
 218        if (t->collect_md)
 219                rcu_assign_pointer(itn->collect_md_tun, NULL);
 220        hlist_del_init_rcu(&t->hash_node);
 221}
 222
 223static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 224                                        struct ip_tunnel_parm *parms,
 225                                        int type)
 226{
 227        __be32 remote = parms->iph.daddr;
 228        __be32 local = parms->iph.saddr;
 229        __be32 key = parms->i_key;
 230        int link = parms->link;
 231        struct ip_tunnel *t = NULL;
 232        struct hlist_head *head = ip_bucket(itn, parms);
 233
 234        hlist_for_each_entry_rcu(t, head, hash_node) {
 235                if (local == t->parms.iph.saddr &&
 236                    remote == t->parms.iph.daddr &&
 237                    key == t->parms.i_key &&
 238                    link == t->parms.link &&
 239                    type == t->dev->type)
 240                        break;
 241        }
 242        return t;
 243}
 244
 245static struct net_device *__ip_tunnel_create(struct net *net,
 246                                             const struct rtnl_link_ops *ops,
 247                                             struct ip_tunnel_parm *parms)
 248{
 249        int err;
 250        struct ip_tunnel *tunnel;
 251        struct net_device *dev;
 252        char name[IFNAMSIZ];
 253
 254        err = -E2BIG;
 255        if (parms->name[0]) {
 256                if (!dev_valid_name(parms->name))
 257                        goto failed;
 258                strlcpy(name, parms->name, IFNAMSIZ);
 259        } else {
 260                if (strlen(ops->kind) > (IFNAMSIZ - 3))
 261                        goto failed;
 262                strlcpy(name, ops->kind, IFNAMSIZ);
 263                strncat(name, "%d", 2);
 264        }
 265
 266        ASSERT_RTNL();
 267        dev = alloc_netdev(ops->priv_size, name, ops->setup);
 268        if (!dev) {
 269                err = -ENOMEM;
 270                goto failed;
 271        }
 272        dev_net_set(dev, net);
 273
 274        dev->rtnl_link_ops = ops;
 275
 276        tunnel = netdev_priv(dev);
 277        tunnel->parms = *parms;
 278        tunnel->net = net;
 279
 280        err = register_netdevice(dev);
 281        if (err)
 282                goto failed_free;
 283
 284        return dev;
 285
 286failed_free:
 287        free_netdev(dev);
 288failed:
 289        return ERR_PTR(err);
 290}
 291
 292static int ip_tunnel_bind_dev(struct net_device *dev)
 293{
 294        struct net_device *tdev = NULL;
 295        struct ip_tunnel *tunnel = netdev_priv(dev);
 296        const struct iphdr *iph;
 297        int hlen = LL_MAX_HEADER;
 298        int mtu = ETH_DATA_LEN;
 299        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 300
 301        iph = &tunnel->parms.iph;
 302
 303        /* Guess output device to choose reasonable mtu and needed_headroom */
 304        if (iph->daddr) {
 305                struct flowi4 fl4;
 306                struct rtable *rt;
 307
 308                ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
 309                                    iph->saddr, tunnel->parms.o_key,
 310                                    RT_TOS(iph->tos), tunnel->parms.link);
 311                rt = ip_route_output_key(tunnel->net, &fl4);
 312
 313                if (!IS_ERR(rt)) {
 314                        tdev = rt->dst.dev;
 315                        ip_rt_put(rt);
 316                }
 317                if (dev->type != ARPHRD_ETHER)
 318                        dev->flags |= IFF_POINTOPOINT;
 319
 320                dst_cache_reset(&tunnel->dst_cache);
 321        }
 322
 323        if (!tdev && tunnel->parms.link)
 324                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 325
 326        if (tdev) {
 327                hlen = tdev->hard_header_len + tdev->needed_headroom;
 328                mtu = min(tdev->mtu, IP_MAX_MTU);
 329        }
 330
 331        dev->needed_headroom = t_hlen + hlen;
 332        mtu -= (dev->hard_header_len + t_hlen);
 333
 334        if (mtu < 68)
 335                mtu = 68;
 336
 337        return mtu;
 338}
 339
 340static struct ip_tunnel *ip_tunnel_create(struct net *net,
 341                                          struct ip_tunnel_net *itn,
 342                                          struct ip_tunnel_parm *parms)
 343{
 344        struct ip_tunnel *nt, *fbt;
 345        struct net_device *dev;
 346        int mtu;
 347        int err;
 348
 349        BUG_ON(!itn->fb_tunnel_dev);
 350        fbt = netdev_priv(itn->fb_tunnel_dev);
 351        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 352        if (IS_ERR(dev))
 353                return ERR_CAST(dev);
 354
 355        mtu = ip_tunnel_bind_dev(dev);
 356        err = dev_set_mtu(dev, mtu);
 357        if (err)
 358                goto err_dev_set_mtu;
 359
 360        nt = netdev_priv(dev);
 361        ip_tunnel_add(itn, nt);
 362        return nt;
 363
 364err_dev_set_mtu:
 365        unregister_netdevice(dev);
 366        return ERR_PTR(err);
 367}
 368
 369int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 370                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 371                  bool log_ecn_error)
 372{
 373        struct pcpu_sw_netstats *tstats;
 374        const struct iphdr *iph = ip_hdr(skb);
 375        int err;
 376
 377#ifdef CONFIG_NET_IPGRE_BROADCAST
 378        if (ipv4_is_multicast(iph->daddr)) {
 379                tunnel->dev->stats.multicast++;
 380                skb->pkt_type = PACKET_BROADCAST;
 381        }
 382#endif
 383
 384        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
 385             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
 386                tunnel->dev->stats.rx_crc_errors++;
 387                tunnel->dev->stats.rx_errors++;
 388                goto drop;
 389        }
 390
 391        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
 392                if (!(tpi->flags&TUNNEL_SEQ) ||
 393                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
 394                        tunnel->dev->stats.rx_fifo_errors++;
 395                        tunnel->dev->stats.rx_errors++;
 396                        goto drop;
 397                }
 398                tunnel->i_seqno = ntohl(tpi->seq) + 1;
 399        }
 400
 401        skb_reset_network_header(skb);
 402
 403        err = IP_ECN_decapsulate(iph, skb);
 404        if (unlikely(err)) {
 405                if (log_ecn_error)
 406                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
 407                                        &iph->saddr, iph->tos);
 408                if (err > 1) {
 409                        ++tunnel->dev->stats.rx_frame_errors;
 410                        ++tunnel->dev->stats.rx_errors;
 411                        goto drop;
 412                }
 413        }
 414
 415        tstats = this_cpu_ptr(tunnel->dev->tstats);
 416        u64_stats_update_begin(&tstats->syncp);
 417        tstats->rx_packets++;
 418        tstats->rx_bytes += skb->len;
 419        u64_stats_update_end(&tstats->syncp);
 420
 421        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
 422
 423        if (tunnel->dev->type == ARPHRD_ETHER) {
 424                skb->protocol = eth_type_trans(skb, tunnel->dev);
 425                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 426        } else {
 427                skb->dev = tunnel->dev;
 428        }
 429
 430        if (tun_dst)
 431                skb_dst_set(skb, (struct dst_entry *)tun_dst);
 432
 433        gro_cells_receive(&tunnel->gro_cells, skb);
 434        return 0;
 435
 436drop:
 437        if (tun_dst)
 438                dst_release((struct dst_entry *)tun_dst);
 439        kfree_skb(skb);
 440        return 0;
 441}
 442EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 443
 444static int ip_encap_hlen(struct ip_tunnel_encap *e)
 445{
 446        const struct ip_tunnel_encap_ops *ops;
 447        int hlen = -EINVAL;
 448
 449        if (e->type == TUNNEL_ENCAP_NONE)
 450                return 0;
 451
 452        if (e->type >= MAX_IPTUN_ENCAP_OPS)
 453                return -EINVAL;
 454
 455        rcu_read_lock();
 456        ops = rcu_dereference(iptun_encaps[e->type]);
 457        if (likely(ops && ops->encap_hlen))
 458                hlen = ops->encap_hlen(e);
 459        rcu_read_unlock();
 460
 461        return hlen;
 462}
 463
 464const struct ip_tunnel_encap_ops __rcu *
 465                iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
 466
 467int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
 468                            unsigned int num)
 469{
 470        if (num >= MAX_IPTUN_ENCAP_OPS)
 471                return -ERANGE;
 472
 473        return !cmpxchg((const struct ip_tunnel_encap_ops **)
 474                        &iptun_encaps[num],
 475                        NULL, ops) ? 0 : -1;
 476}
 477EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
 478
 479int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
 480                            unsigned int num)
 481{
 482        int ret;
 483
 484        if (num >= MAX_IPTUN_ENCAP_OPS)
 485                return -ERANGE;
 486
 487        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
 488                       &iptun_encaps[num],
 489                       ops, NULL) == ops) ? 0 : -1;
 490
 491        synchronize_net();
 492
 493        return ret;
 494}
 495EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
 496
 497int ip_tunnel_encap_setup(struct ip_tunnel *t,
 498                          struct ip_tunnel_encap *ipencap)
 499{
 500        int hlen;
 501
 502        memset(&t->encap, 0, sizeof(t->encap));
 503
 504        hlen = ip_encap_hlen(ipencap);
 505        if (hlen < 0)
 506                return hlen;
 507
 508        t->encap.type = ipencap->type;
 509        t->encap.sport = ipencap->sport;
 510        t->encap.dport = ipencap->dport;
 511        t->encap.flags = ipencap->flags;
 512
 513        t->encap_hlen = hlen;
 514        t->hlen = t->encap_hlen + t->tun_hlen;
 515
 516        return 0;
 517}
 518EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 519
 520int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
 521                    u8 *protocol, struct flowi4 *fl4)
 522{
 523        const struct ip_tunnel_encap_ops *ops;
 524        int ret = -EINVAL;
 525
 526        if (t->encap.type == TUNNEL_ENCAP_NONE)
 527                return 0;
 528
 529        if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
 530                return -EINVAL;
 531
 532        rcu_read_lock();
 533        ops = rcu_dereference(iptun_encaps[t->encap.type]);
 534        if (likely(ops && ops->build_header))
 535                ret = ops->build_header(skb, &t->encap, protocol, fl4);
 536        rcu_read_unlock();
 537
 538        return ret;
 539}
 540EXPORT_SYMBOL(ip_tunnel_encap);
 541
 542static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 543                            struct rtable *rt, __be16 df,
 544                            const struct iphdr *inner_iph)
 545{
 546        struct ip_tunnel *tunnel = netdev_priv(dev);
 547        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
 548        int mtu;
 549
 550        if (df)
 551                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
 552                                        - sizeof(struct iphdr) - tunnel->hlen;
 553        else
 554                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 555
 556        if (skb_valid_dst(skb))
 557                skb_dst_update_pmtu(skb, mtu);
 558
 559        if (skb->protocol == htons(ETH_P_IP)) {
 560                if (!skb_is_gso(skb) &&
 561                    (inner_iph->frag_off & htons(IP_DF)) &&
 562                    mtu < pkt_size) {
 563                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 564                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 565                        return -E2BIG;
 566                }
 567        }
 568#if IS_ENABLED(CONFIG_IPV6)
 569        else if (skb->protocol == htons(ETH_P_IPV6)) {
 570                struct rt6_info *rt6;
 571
 572                rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
 573                                           NULL;
 574
 575                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
 576                           mtu >= IPV6_MIN_MTU) {
 577                        if ((tunnel->parms.iph.daddr &&
 578                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 579                            rt6->rt6i_dst.plen == 128) {
 580                                rt6->rt6i_flags |= RTF_MODIFIED;
 581                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 582                        }
 583                }
 584
 585                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
 586                                        mtu < pkt_size) {
 587                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 588                        return -E2BIG;
 589                }
 590        }
 591#endif
 592        return 0;
 593}
 594
 595void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 596                    const struct iphdr *tnl_params, u8 protocol)
 597{
 598        struct ip_tunnel *tunnel = netdev_priv(dev);
 599        unsigned int inner_nhdr_len = 0;
 600        const struct iphdr *inner_iph;
 601        struct flowi4 fl4;
 602        u8     tos, ttl;
 603        __be16 df;
 604        struct rtable *rt;              /* Route to the other host */
 605        unsigned int max_headroom;      /* The extra header space needed */
 606        __be32 dst;
 607        bool connected;
 608
 609        /* ensure we can access the inner net header, for several users below */
 610        if (skb->protocol == htons(ETH_P_IP))
 611                inner_nhdr_len = sizeof(struct iphdr);
 612        else if (skb->protocol == htons(ETH_P_IPV6))
 613                inner_nhdr_len = sizeof(struct ipv6hdr);
 614        if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
 615                goto tx_error;
 616
 617        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 618        connected = (tunnel->parms.iph.daddr != 0);
 619
 620        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 621
 622        dst = tnl_params->daddr;
 623        if (dst == 0) {
 624                /* NBMA tunnel */
 625
 626                if (skb_dst(skb) == NULL) {
 627                        dev->stats.tx_fifo_errors++;
 628                        goto tx_error;
 629                }
 630
 631                if (skb->protocol == htons(ETH_P_IP)) {
 632                        rt = skb_rtable(skb);
 633                        dst = rt_nexthop(rt, inner_iph->daddr);
 634                }
 635#if IS_ENABLED(CONFIG_IPV6)
 636                else if (skb->protocol == htons(ETH_P_IPV6)) {
 637                        const struct in6_addr *addr6;
 638                        struct neighbour *neigh;
 639                        bool do_tx_error_icmp;
 640                        int addr_type;
 641
 642                        neigh = dst_neigh_lookup(skb_dst(skb),
 643                                                 &ipv6_hdr(skb)->daddr);
 644                        if (neigh == NULL)
 645                                goto tx_error;
 646
 647                        addr6 = (const struct in6_addr *)&neigh->primary_key;
 648                        addr_type = ipv6_addr_type(addr6);
 649
 650                        if (addr_type == IPV6_ADDR_ANY) {
 651                                addr6 = &ipv6_hdr(skb)->daddr;
 652                                addr_type = ipv6_addr_type(addr6);
 653                        }
 654
 655                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 656                                do_tx_error_icmp = true;
 657                        else {
 658                                do_tx_error_icmp = false;
 659                                dst = addr6->s6_addr32[3];
 660                        }
 661                        neigh_release(neigh);
 662                        if (do_tx_error_icmp)
 663                                goto tx_error_icmp;
 664                }
 665#endif
 666                else
 667                        goto tx_error;
 668
 669                connected = false;
 670        }
 671
 672        tos = tnl_params->tos;
 673        if (tos & 0x1) {
 674                tos &= ~0x1;
 675                if (skb->protocol == htons(ETH_P_IP)) {
 676                        tos = inner_iph->tos;
 677                        connected = false;
 678                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 679                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
 680                        connected = false;
 681                }
 682        }
 683
 684        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
 685                            tunnel->parms.o_key, RT_TOS(tos),
 686                            tunnel->parms.link);
 687
 688        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
 689                goto tx_error;
 690
 691        rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
 692                         NULL;
 693
 694        if (!rt) {
 695                rt = ip_route_output_key(tunnel->net, &fl4);
 696
 697                if (IS_ERR(rt)) {
 698                        dev->stats.tx_carrier_errors++;
 699                        goto tx_error;
 700                }
 701                if (connected)
 702                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
 703                                          fl4.saddr);
 704        }
 705
 706        if (rt->dst.dev == dev) {
 707                ip_rt_put(rt);
 708                dev->stats.collisions++;
 709                goto tx_error;
 710        }
 711
 712        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
 713                ip_rt_put(rt);
 714                goto tx_error;
 715        }
 716
 717        if (tunnel->err_count > 0) {
 718                if (time_before(jiffies,
 719                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 720                        tunnel->err_count--;
 721
 722                        dst_link_failure(skb);
 723                } else
 724                        tunnel->err_count = 0;
 725        }
 726
 727        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
 728        ttl = tnl_params->ttl;
 729        if (ttl == 0) {
 730                if (skb->protocol == htons(ETH_P_IP))
 731                        ttl = inner_iph->ttl;
 732#if IS_ENABLED(CONFIG_IPV6)
 733                else if (skb->protocol == htons(ETH_P_IPV6))
 734                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
 735#endif
 736                else
 737                        ttl = ip4_dst_hoplimit(&rt->dst);
 738        }
 739
 740        df = tnl_params->frag_off;
 741        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
 742                df |= (inner_iph->frag_off&htons(IP_DF));
 743
 744        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
 745                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
 746        if (max_headroom > dev->needed_headroom)
 747                dev->needed_headroom = max_headroom;
 748
 749        if (skb_cow_head(skb, dev->needed_headroom)) {
 750                ip_rt_put(rt);
 751                dev->stats.tx_dropped++;
 752                kfree_skb(skb);
 753                return;
 754        }
 755
 756        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
 757                      df, !net_eq(tunnel->net, dev_net(dev)));
 758        return;
 759
 760#if IS_ENABLED(CONFIG_IPV6)
 761tx_error_icmp:
 762        dst_link_failure(skb);
 763#endif
 764tx_error:
 765        dev->stats.tx_errors++;
 766        kfree_skb(skb);
 767}
 768EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
 769
 770static void ip_tunnel_update(struct ip_tunnel_net *itn,
 771                             struct ip_tunnel *t,
 772                             struct net_device *dev,
 773                             struct ip_tunnel_parm *p,
 774                             bool set_mtu)
 775{
 776        ip_tunnel_del(itn, t);
 777        t->parms.iph.saddr = p->iph.saddr;
 778        t->parms.iph.daddr = p->iph.daddr;
 779        t->parms.i_key = p->i_key;
 780        t->parms.o_key = p->o_key;
 781        if (dev->type != ARPHRD_ETHER) {
 782                memcpy(dev->dev_addr, &p->iph.saddr, 4);
 783                memcpy(dev->broadcast, &p->iph.daddr, 4);
 784        }
 785        ip_tunnel_add(itn, t);
 786
 787        t->parms.iph.ttl = p->iph.ttl;
 788        t->parms.iph.tos = p->iph.tos;
 789        t->parms.iph.frag_off = p->iph.frag_off;
 790
 791        if (t->parms.link != p->link) {
 792                int mtu;
 793
 794                t->parms.link = p->link;
 795                mtu = ip_tunnel_bind_dev(dev);
 796                if (set_mtu)
 797                        dev->mtu = mtu;
 798        }
 799        dst_cache_reset(&t->dst_cache);
 800        netdev_state_change(dev);
 801}
 802
 803int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 804{
 805        int err = 0;
 806        struct ip_tunnel *t = netdev_priv(dev);
 807        struct net *net = t->net;
 808        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
 809
 810        BUG_ON(!itn->fb_tunnel_dev);
 811        switch (cmd) {
 812        case SIOCGETTUNNEL:
 813                if (dev == itn->fb_tunnel_dev) {
 814                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 815                        if (t == NULL)
 816                                t = netdev_priv(dev);
 817                }
 818                memcpy(p, &t->parms, sizeof(*p));
 819                break;
 820
 821        case SIOCADDTUNNEL:
 822        case SIOCCHGTUNNEL:
 823                err = -EPERM;
 824                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 825                        goto done;
 826                if (p->iph.ttl)
 827                        p->iph.frag_off |= htons(IP_DF);
 828                if (!(p->i_flags & VTI_ISVTI)) {
 829                        if (!(p->i_flags & TUNNEL_KEY))
 830                                p->i_key = 0;
 831                        if (!(p->o_flags & TUNNEL_KEY))
 832                                p->o_key = 0;
 833                }
 834
 835                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 836
 837                if (cmd == SIOCADDTUNNEL) {
 838                        if (!t) {
 839                                t = ip_tunnel_create(net, itn, p);
 840                                err = PTR_RET(t);
 841                                break;
 842                        }
 843
 844                        err = -EEXIST;
 845                        break;
 846                }
 847                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
 848                        if (t != NULL) {
 849                                if (t->dev != dev) {
 850                                        err = -EEXIST;
 851                                        break;
 852                                }
 853                        } else {
 854                                unsigned int nflags = 0;
 855
 856                                if (ipv4_is_multicast(p->iph.daddr))
 857                                        nflags = IFF_BROADCAST;
 858                                else if (p->iph.daddr)
 859                                        nflags = IFF_POINTOPOINT;
 860
 861                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
 862                                        err = -EINVAL;
 863                                        break;
 864                                }
 865
 866                                t = netdev_priv(dev);
 867                        }
 868                }
 869
 870                if (t) {
 871                        err = 0;
 872                        ip_tunnel_update(itn, t, dev, p, true);
 873                } else {
 874                        err = -ENOENT;
 875                }
 876                break;
 877
 878        case SIOCDELTUNNEL:
 879                err = -EPERM;
 880                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 881                        goto done;
 882
 883                if (dev == itn->fb_tunnel_dev) {
 884                        err = -ENOENT;
 885                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 886                        if (t == NULL)
 887                                goto done;
 888                        err = -EPERM;
 889                        if (t == netdev_priv(itn->fb_tunnel_dev))
 890                                goto done;
 891                        dev = t->dev;
 892                }
 893                unregister_netdevice(dev);
 894                err = 0;
 895                break;
 896
 897        default:
 898                err = -EINVAL;
 899        }
 900
 901done:
 902        return err;
 903}
 904EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 905
 906int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 907{
 908        struct ip_tunnel *tunnel = netdev_priv(dev);
 909        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 910        int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
 911
 912        if (new_mtu < 68)
 913                return -EINVAL;
 914
 915        if (new_mtu > max_mtu) {
 916                if (strict)
 917                        return -EINVAL;
 918
 919                new_mtu = max_mtu;
 920        }
 921
 922        dev->mtu = new_mtu;
 923        return 0;
 924}
 925EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
 926
 927int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 928{
 929        return __ip_tunnel_change_mtu(dev, new_mtu, true);
 930}
 931EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 932
/* ip_tunnel_dev_free - destructor for tunnel netdevices
 *
 * Releases the per-device resources acquired in ip_tunnel_init(), in
 * reverse order of allocation: GRO cells, the dst cache, and finally
 * the per-CPU stats.  Installed as dev->extended->priv_destructor by
 * ip_tunnel_init(), so it runs on netdev teardown.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}
 941
 942void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 943{
 944        struct ip_tunnel *tunnel = netdev_priv(dev);
 945        struct ip_tunnel_net *itn;
 946
 947        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 948
 949        if (itn->fb_tunnel_dev != dev) {
 950                ip_tunnel_del(itn, netdev_priv(dev));
 951                unregister_netdevice_queue(dev, head);
 952        }
 953}
 954EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
 955
 956struct net *ip_tunnel_get_link_net(const struct net_device *dev)
 957{
 958        struct ip_tunnel *tunnel = netdev_priv(dev);
 959
 960        return tunnel->net;
 961}
 962EXPORT_SYMBOL(ip_tunnel_get_link_net);
 963
 964int ip_tunnel_get_iflink(const struct net_device *dev)
 965{
 966        struct ip_tunnel *tunnel = netdev_priv(dev);
 967
 968        return tunnel->parms.link;
 969}
 970EXPORT_SYMBOL(ip_tunnel_get_iflink);
 971
 972int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 973                                  struct rtnl_link_ops *ops, char *devname)
 974{
 975        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
 976        struct ip_tunnel_parm parms;
 977        unsigned int i;
 978
 979        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
 980                INIT_HLIST_HEAD(&itn->tunnels[i]);
 981
 982        if (!ops) {
 983                itn->fb_tunnel_dev = NULL;
 984                return 0;
 985        }
 986
 987        memset(&parms, 0, sizeof(parms));
 988        if (devname)
 989                strlcpy(parms.name, devname, IFNAMSIZ);
 990
 991        rtnl_lock();
 992        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
 993        /* FB netdevice is special: we have one, and only one per netns.
 994         * Allowing to move it to another netns is clearly unsafe.
 995         */
 996        if (!IS_ERR(itn->fb_tunnel_dev)) {
 997                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
 998                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
 999                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1000        }
1001        rtnl_unlock();
1002
1003        if (IS_ERR(itn->fb_tunnel_dev))
1004                return PTR_ERR(itn->fb_tunnel_dev);
1005
1006        return 0;
1007}
1008EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1009
/* ip_tunnel_destroy - collect every tunnel of this type for deletion
 * @itn:  per-netns tunnel state (its fallback device must exist)
 * @head: list that accumulates devices for batched unregistration
 * @ops:  rtnl_link_ops identifying the tunnel type
 *
 * Called with RTNL held (see ip_tunnel_delete_net()).  Two passes are
 * needed: the netdev walk finds all devices of this type registered in
 * @itn's own namespace, while the hash-table walk catches tunnels that
 * are hashed here but whose device was moved to another namespace.
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	/* Pass 1: devices of this tunnel type living in this netns. */
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	/* Pass 2: hashed tunnels whose device lives elsewhere. */
	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1034
1035void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1036{
1037        LIST_HEAD(list);
1038
1039        rtnl_lock();
1040        ip_tunnel_destroy(itn, &list, ops);
1041        unregister_netdevice_many(&list);
1042        rtnl_unlock();
1043}
1044EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1045
1046int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1047                      struct ip_tunnel_parm *p)
1048{
1049        struct ip_tunnel *nt;
1050        struct net *net = dev_net(dev);
1051        struct ip_tunnel_net *itn;
1052        int mtu;
1053        int err;
1054
1055        nt = netdev_priv(dev);
1056        itn = net_generic(net, nt->ip_tnl_net_id);
1057
1058        if (nt->collect_md) {
1059                if (rtnl_dereference(itn->collect_md_tun))
1060                        return -EEXIST;
1061        } else {
1062                if (ip_tunnel_find(itn, p, dev->type))
1063                        return -EEXIST;
1064        }
1065
1066        nt->net = net;
1067        nt->parms = *p;
1068        err = register_netdevice(dev);
1069        if (err)
1070                goto err_register_netdevice;
1071
1072        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1073                eth_hw_addr_random(dev);
1074
1075        mtu = ip_tunnel_bind_dev(dev);
1076        if (tb[IFLA_MTU]) {
1077                unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1078
1079                mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1080                            (unsigned int)(max - sizeof(struct iphdr)));
1081        }
1082
1083        err = dev_set_mtu(dev, mtu);
1084        if (err)
1085                goto err_dev_set_mtu;
1086
1087        ip_tunnel_add(itn, nt);
1088        return 0;
1089
1090err_dev_set_mtu:
1091        unregister_netdevice(dev);
1092err_register_netdevice:
1093        return err;
1094}
1095EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1096
1097int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1098                         struct ip_tunnel_parm *p)
1099{
1100        struct ip_tunnel *t;
1101        struct ip_tunnel *tunnel = netdev_priv(dev);
1102        struct net *net = tunnel->net;
1103        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1104
1105        if (dev == itn->fb_tunnel_dev)
1106                return -EINVAL;
1107
1108        t = ip_tunnel_find(itn, p, dev->type);
1109
1110        if (t) {
1111                if (t->dev != dev)
1112                        return -EEXIST;
1113        } else {
1114                t = tunnel;
1115
1116                if (dev->type != ARPHRD_ETHER) {
1117                        unsigned int nflags = 0;
1118
1119                        if (ipv4_is_multicast(p->iph.daddr))
1120                                nflags = IFF_BROADCAST;
1121                        else if (p->iph.daddr)
1122                                nflags = IFF_POINTOPOINT;
1123
1124                        if ((dev->flags ^ nflags) &
1125                            (IFF_POINTOPOINT | IFF_BROADCAST))
1126                                return -EINVAL;
1127                }
1128        }
1129
1130        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1131        return 0;
1132}
1133EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1134
1135int ip_tunnel_init(struct net_device *dev)
1136{
1137        struct ip_tunnel *tunnel = netdev_priv(dev);
1138        struct iphdr *iph = &tunnel->parms.iph;
1139        int err;
1140
1141        dev->extended->needs_free_netdev = true;
1142        dev->extended->priv_destructor = ip_tunnel_dev_free;
1143        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1144        if (!dev->tstats)
1145                return -ENOMEM;
1146
1147        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1148        if (err) {
1149                free_percpu(dev->tstats);
1150                return err;
1151        }
1152
1153        err = gro_cells_init(&tunnel->gro_cells, dev);
1154        if (err) {
1155                dst_cache_destroy(&tunnel->dst_cache);
1156                free_percpu(dev->tstats);
1157                return err;
1158        }
1159
1160        tunnel->dev = dev;
1161        tunnel->net = dev_net(dev);
1162        strcpy(tunnel->parms.name, dev->name);
1163        iph->version            = 4;
1164        iph->ihl                = 5;
1165
1166        if (tunnel->collect_md) {
1167                dev->features |= NETIF_F_NETNS_LOCAL;
1168                netif_keep_dst(dev);
1169        }
1170        return 0;
1171}
1172EXPORT_SYMBOL_GPL(ip_tunnel_init);
1173
1174void ip_tunnel_uninit(struct net_device *dev)
1175{
1176        struct ip_tunnel *tunnel = netdev_priv(dev);
1177        struct net *net = tunnel->net;
1178        struct ip_tunnel_net *itn;
1179
1180        itn = net_generic(net, tunnel->ip_tnl_net_id);
1181        /* fb_tunnel_dev will be unregisted in net-exit call. */
1182        if (itn->fb_tunnel_dev != dev)
1183                ip_tunnel_del(itn, netdev_priv(dev));
1184
1185        dst_cache_reset(&tunnel->dst_cache);
1186}
1187EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1188
1189/* Do least required initialization, rest of init is done in tunnel_init call */
1190void ip_tunnel_setup(struct net_device *dev, int net_id)
1191{
1192        struct ip_tunnel *tunnel = netdev_priv(dev);
1193        tunnel->ip_tnl_net_id = net_id;
1194}
1195EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1196
1197MODULE_LICENSE("GPL");
1198