linux/net/ipv4/ip_tunnel.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2013 Nicira, Inc.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of version 2 of the GNU General Public
   6 * License as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful, but
   9 * WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * along with this program; if not, write to the Free Software
  15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16 * 02110-1301, USA
  17 */
  18
  19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  20
  21#include <linux/capability.h>
  22#include <linux/module.h>
  23#include <linux/types.h>
  24#include <linux/kernel.h>
  25#include <linux/slab.h>
  26#include <linux/uaccess.h>
  27#include <linux/skbuff.h>
  28#include <linux/netdevice.h>
  29#include <linux/in.h>
  30#include <linux/tcp.h>
  31#include <linux/udp.h>
  32#include <linux/if_arp.h>
  33#include <linux/mroute.h>
  34#include <linux/init.h>
  35#include <linux/in6.h>
  36#include <linux/inetdevice.h>
  37#include <linux/igmp.h>
  38#include <linux/netfilter_ipv4.h>
  39#include <linux/etherdevice.h>
  40#include <linux/if_ether.h>
  41#include <linux/if_vlan.h>
  42#include <linux/rculist.h>
  43#include <linux/err.h>
  44
  45#include <net/sock.h>
  46#include <net/ip.h>
  47#include <net/icmp.h>
  48#include <net/protocol.h>
  49#include <net/ip_tunnels.h>
  50#include <net/arp.h>
  51#include <net/checksum.h>
  52#include <net/dsfield.h>
  53#include <net/inet_ecn.h>
  54#include <net/xfrm.h>
  55#include <net/net_namespace.h>
  56#include <net/netns/generic.h>
  57#include <net/rtnetlink.h>
  58#include <net/udp.h>
  59
  60#if IS_ENABLED(CONFIG_IPV6)
  61#include <net/ipv6.h>
  62#include <net/ip6_fib.h>
  63#include <net/ip6_route.h>
  64#endif
  65
  66static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
  67{
  68        return hash_32((__force u32)key ^ (__force u32)remote,
  69                         IP_TNL_HASH_BITS);
  70}
  71
  72static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
  73                             struct dst_entry *dst, __be32 saddr)
  74{
  75        struct dst_entry *old_dst;
  76
  77        dst_clone(dst);
  78        old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
  79        dst_release(old_dst);
  80        idst->saddr = saddr;
  81}
  82
  83static noinline void tunnel_dst_set(struct ip_tunnel *t,
  84                           struct dst_entry *dst, __be32 saddr)
  85{
  86        __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
  87}
  88
/* Clear the dst cache slot of tunnel @t used by the executing CPU by
 * storing a NULL dst and a zero source address in it.
 */
static void tunnel_dst_reset(struct ip_tunnel *t)
{
        tunnel_dst_set(t, NULL, 0);
}
  93
  94void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
  95{
  96        int i;
  97
  98        for_each_possible_cpu(i)
  99                __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
 100}
 101EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
 102
 103static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
 104                                        u32 cookie, __be32 *saddr)
 105{
 106        struct ip_tunnel_dst *idst;
 107        struct dst_entry *dst;
 108
 109        rcu_read_lock();
 110        idst = raw_cpu_ptr(t->dst_cache);
 111        dst = rcu_dereference(idst->dst);
 112        if (dst && !atomic_inc_not_zero(&dst->__refcnt))
 113                dst = NULL;
 114        if (dst) {
 115                if (!dst->obsolete || dst->ops->check(dst, cookie)) {
 116                        *saddr = idst->saddr;
 117                } else {
 118                        tunnel_dst_reset(t);
 119                        dst_release(dst);
 120                        dst = NULL;
 121                }
 122        }
 123        rcu_read_unlock();
 124        return (struct rtable *)dst;
 125}
 126
 127static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
 128                                __be16 flags, __be32 key)
 129{
 130        if (p->i_flags & TUNNEL_KEY) {
 131                if (flags & TUNNEL_KEY)
 132                        return key == p->i_key;
 133                else
 134                        /* key expected, none present */
 135                        return false;
 136        } else
 137                return !(flags & TUNNEL_KEY);
 138}
 139
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
/* Find the tunnel in @itn that should receive a packet.
 *
 * @itn:    per-netns tunnel table to search
 * @link:   interface index compared against each tunnel's parms.link
 * @flags:  tunnel flags of the packet (TUNNEL_KEY / TUNNEL_NO_KEY are used)
 * @remote: address compared against tunnels' iph.daddr
 * @local:  address compared against tunnels' iph.saddr
 * @key:    key carried by the packet
 *
 * Four passes are made, from most to least specific.  In every pass a
 * tunnel whose parms.link equals @link is returned immediately; a tunnel
 * that matches everything but the link is only remembered as a candidate.
 * If no pass returns, the result is, in order: the remembered candidate,
 * the collect_md tunnel, the fallback tunnel if its device is up, or NULL.
 *
 * NOTE(review): the table is walked with the _rcu list iterator and
 * rcu_dereference(), so callers presumably must be inside an RCU read-side
 * critical section -- confirm against callers.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        /* Pass 1: both local and remote addresses match exactly.  Note the
         * candidate is overwritten unconditionally here, so the last
         * link-mismatched entry of this pass is the one remembered.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        /* Pass 2: remote matches and the tunnel has no local address
         * configured (saddr == 0).  From here on a candidate is kept only
         * if an earlier pass did not already provide one.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        /* The remaining passes use the bucket hashed with remote == 0. */
        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        /* Pass 3: tunnel has our local address and no remote configured,
         * or the packet's local address is multicast and equals the
         * tunnel's configured daddr.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        /* Callers passing TUNNEL_NO_KEY skip the key-only pass below. */
        if (flags & TUNNEL_NO_KEY)
                goto skip_key_lookup;

        /* Pass 4: tunnel with neither address configured; only its i_key
         * is compared with the packet key.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (t->parms.i_key != key ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

skip_key_lookup:
        /* No link-exact match: fall back to the best link-mismatched one. */
        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t)
                return t;

        /* Last resort: the fallback device, but only while it is up. */
        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
 243
 244static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 245                                    struct ip_tunnel_parm *parms)
 246{
 247        unsigned int h;
 248        __be32 remote;
 249        __be32 i_key = parms->i_key;
 250
 251        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 252                remote = parms->iph.daddr;
 253        else
 254                remote = 0;
 255
 256        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
 257                i_key = 0;
 258
 259        h = ip_tunnel_hash(i_key, remote);
 260        return &itn->tunnels[h];
 261}
 262
 263static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 264{
 265        struct hlist_head *head = ip_bucket(itn, &t->parms);
 266
 267        if (t->collect_md)
 268                rcu_assign_pointer(itn->collect_md_tun, t);
 269        hlist_add_head_rcu(&t->hash_node, head);
 270}
 271
/* Remove tunnel @t from the hash table of @itn. */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        /* A collect_md tunnel is also reachable via itn->collect_md_tun
         * (set in ip_tunnel_add()); clear that pointer as well.
         */
        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, NULL);
        hlist_del_init_rcu(&t->hash_node);
}
 278
 279static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 280                                        struct ip_tunnel_parm *parms,
 281                                        int type)
 282{
 283        __be32 remote = parms->iph.daddr;
 284        __be32 local = parms->iph.saddr;
 285        __be32 key = parms->i_key;
 286        __be16 flags = parms->i_flags;
 287        int link = parms->link;
 288        struct ip_tunnel *t = NULL;
 289        struct hlist_head *head = ip_bucket(itn, parms);
 290
 291        hlist_for_each_entry_rcu(t, head, hash_node) {
 292                if (local == t->parms.iph.saddr &&
 293                    remote == t->parms.iph.daddr &&
 294                    link == t->parms.link &&
 295                    type == t->dev->type &&
 296                    ip_tunnel_key_match(&t->parms, flags, key))
 297                        break;
 298        }
 299        return t;
 300}
 301
 302static struct net_device *__ip_tunnel_create(struct net *net,
 303                                             const struct rtnl_link_ops *ops,
 304                                             struct ip_tunnel_parm *parms)
 305{
 306        int err;
 307        struct ip_tunnel *tunnel;
 308        struct net_device *dev;
 309        char name[IFNAMSIZ];
 310
 311        if (parms->name[0])
 312                strlcpy(name, parms->name, IFNAMSIZ);
 313        else {
 314                if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
 315                        err = -E2BIG;
 316                        goto failed;
 317                }
 318                strlcpy(name, ops->kind, IFNAMSIZ);
 319                strncat(name, "%d", 2);
 320        }
 321
 322        ASSERT_RTNL();
 323        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
 324        if (!dev) {
 325                err = -ENOMEM;
 326                goto failed;
 327        }
 328        dev_net_set(dev, net);
 329
 330        dev->rtnl_link_ops = ops;
 331
 332        tunnel = netdev_priv(dev);
 333        tunnel->parms = *parms;
 334        tunnel->net = net;
 335
 336        err = register_netdevice(dev);
 337        if (err)
 338                goto failed_free;
 339
 340        return dev;
 341
 342failed_free:
 343        free_netdev(dev);
 344failed:
 345        return ERR_PTR(err);
 346}
 347
 348static inline void init_tunnel_flow(struct flowi4 *fl4,
 349                                    int proto,
 350                                    __be32 daddr, __be32 saddr,
 351                                    __be32 key, __u8 tos, int oif)
 352{
 353        memset(fl4, 0, sizeof(*fl4));
 354        fl4->flowi4_oif = oif;
 355        fl4->daddr = daddr;
 356        fl4->saddr = saddr;
 357        fl4->flowi4_tos = tos;
 358        fl4->flowi4_proto = proto;
 359        fl4->fl4_gre_key = key;
 360}
 361
 362static int ip_tunnel_bind_dev(struct net_device *dev)
 363{
 364        struct net_device *tdev = NULL;
 365        struct ip_tunnel *tunnel = netdev_priv(dev);
 366        const struct iphdr *iph;
 367        int hlen = LL_MAX_HEADER;
 368        int mtu = ETH_DATA_LEN;
 369        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 370
 371        iph = &tunnel->parms.iph;
 372
 373        /* Guess output device to choose reasonable mtu and needed_headroom */
 374        if (iph->daddr) {
 375                struct flowi4 fl4;
 376                struct rtable *rt;
 377
 378                init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
 379                                 iph->saddr, tunnel->parms.o_key,
 380                                 RT_TOS(iph->tos), tunnel->parms.link);
 381                rt = ip_route_output_key(tunnel->net, &fl4);
 382
 383                if (!IS_ERR(rt)) {
 384                        tdev = rt->dst.dev;
 385                        tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
 386                        ip_rt_put(rt);
 387                }
 388                if (dev->type != ARPHRD_ETHER)
 389                        dev->flags |= IFF_POINTOPOINT;
 390        }
 391
 392        if (!tdev && tunnel->parms.link)
 393                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 394
 395        if (tdev) {
 396                hlen = tdev->hard_header_len + tdev->needed_headroom;
 397                mtu = tdev->mtu;
 398        }
 399
 400        dev->needed_headroom = t_hlen + hlen;
 401        mtu -= (dev->hard_header_len + t_hlen);
 402
 403        if (mtu < 68)
 404                mtu = 68;
 405
 406        return mtu;
 407}
 408
 409static struct ip_tunnel *ip_tunnel_create(struct net *net,
 410                                          struct ip_tunnel_net *itn,
 411                                          struct ip_tunnel_parm *parms)
 412{
 413        struct ip_tunnel *nt;
 414        struct net_device *dev;
 415
 416        BUG_ON(!itn->fb_tunnel_dev);
 417        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 418        if (IS_ERR(dev))
 419                return ERR_CAST(dev);
 420
 421        dev->mtu = ip_tunnel_bind_dev(dev);
 422
 423        nt = netdev_priv(dev);
 424        ip_tunnel_add(itn, nt);
 425        return nt;
 426}
 427
/* Deliver a decapsulated packet to tunnel @tunnel.
 *
 * @tunnel:        tunnel the packet was matched to
 * @skb:           the packet; consumed on every path (queued or freed)
 * @tpi:           parsed tunnel header info (flags and sequence number are
 *                 used here)
 * @tun_dst:       optional metadata dst attached to the skb when non-NULL
 * @log_ecn_error: whether to log (rate-limited) ECN decapsulation problems
 *
 * Validates checksum-flag and sequence-number expectations and ECN state,
 * updates the device statistics and hands the packet to GRO cells.
 *
 * Returns 0 in all cases, including when the packet is dropped.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        /* Multicast outer destination: count it and mark as broadcast. */
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        /* The packet and the tunnel configuration must agree on whether
         * a checksum is present.
         */
        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        /* With sequencing enabled, drop packets that carry no sequence
         * number or one that is behind the next expected value.  The
         * comparison uses a signed 32-bit difference; it is skipped while
         * i_seqno is still 0.
         */
        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        /* iph still points at the outer header fetched above; a non-zero
         * result is optionally logged, and only values above 1 cause a drop.
         */
        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        /* Account the packet in the per-CPU software counters. */
        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        /* Second argument is true when the tunnel's netns differs from
         * the netns of its device.
         */
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                /* Ethernet payload: parse the inner MAC header and adjust
                 * the receive checksum for the pulled bytes.
                 */
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 500
 501static int ip_encap_hlen(struct ip_tunnel_encap *e)
 502{
 503        const struct ip_tunnel_encap_ops *ops;
 504        int hlen = -EINVAL;
 505
 506        if (e->type == TUNNEL_ENCAP_NONE)
 507                return 0;
 508
 509        if (e->type >= MAX_IPTUN_ENCAP_OPS)
 510                return -EINVAL;
 511
 512        rcu_read_lock();
 513        ops = rcu_dereference(iptun_encaps[e->type]);
 514        if (likely(ops && ops->encap_hlen))
 515                hlen = ops->encap_hlen(e);
 516        rcu_read_unlock();
 517
 518        return hlen;
 519}
 520
/* Table of registered encapsulation ops, indexed by encapsulation type.
 * Slots are claimed and released with cmpxchg() in
 * ip_tunnel_encap_add_ops()/ip_tunnel_encap_del_ops() and read with
 * rcu_dereference() under rcu_read_lock().
 */
const struct ip_tunnel_encap_ops __rcu *
                iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
 523
 524int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
 525                            unsigned int num)
 526{
 527        if (num >= MAX_IPTUN_ENCAP_OPS)
 528                return -ERANGE;
 529
 530        return !cmpxchg((const struct ip_tunnel_encap_ops **)
 531                        &iptun_encaps[num],
 532                        NULL, ops) ? 0 : -1;
 533}
 534EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
 535
 536int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
 537                            unsigned int num)
 538{
 539        int ret;
 540
 541        if (num >= MAX_IPTUN_ENCAP_OPS)
 542                return -ERANGE;
 543
 544        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
 545                       &iptun_encaps[num],
 546                       ops, NULL) == ops) ? 0 : -1;
 547
 548        synchronize_net();
 549
 550        return ret;
 551}
 552EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
 553
 554int ip_tunnel_encap_setup(struct ip_tunnel *t,
 555                          struct ip_tunnel_encap *ipencap)
 556{
 557        int hlen;
 558
 559        memset(&t->encap, 0, sizeof(t->encap));
 560
 561        hlen = ip_encap_hlen(ipencap);
 562        if (hlen < 0)
 563                return hlen;
 564
 565        t->encap.type = ipencap->type;
 566        t->encap.sport = ipencap->sport;
 567        t->encap.dport = ipencap->dport;
 568        t->encap.flags = ipencap->flags;
 569
 570        t->encap_hlen = hlen;
 571        t->hlen = t->encap_hlen + t->tun_hlen;
 572
 573        return 0;
 574}
 575EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 576
 577int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
 578                    u8 *protocol, struct flowi4 *fl4)
 579{
 580        const struct ip_tunnel_encap_ops *ops;
 581        int ret = -EINVAL;
 582
 583        if (t->encap.type == TUNNEL_ENCAP_NONE)
 584                return 0;
 585
 586        if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
 587                return -EINVAL;
 588
 589        rcu_read_lock();
 590        ops = rcu_dereference(iptun_encaps[t->encap.type]);
 591        if (likely(ops && ops->build_header))
 592                ret = ops->build_header(skb, &t->encap, protocol, fl4);
 593        rcu_read_unlock();
 594
 595        return ret;
 596}
 597EXPORT_SYMBOL(ip_tunnel_encap);
 598
/* Propagate path-MTU information for a packet about to be tunnelled and
 * reject packets that are too big to be sent without fragmentation.
 *
 * @dev:       the tunnel device
 * @skb:       packet being transmitted
 * @rt:        route chosen for the outer packet
 * @df:        outer DF setting; non-zero selects the route MTU minus the
 *             tunnel overhead as the effective MTU
 * @inner_iph: inner IP header of @skb
 *
 * Returns 0 if transmission may proceed, or -E2BIG after an ICMP
 * "fragmentation needed" / ICMPv6 "packet too big" error has been sent to
 * the originator.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df,
                            const struct iphdr *inner_iph)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        /* Payload size: skb length less the tunnel and link-layer headers. */
        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
        int mtu;

        if (df)
                /* MTU of the outer route less everything the tunnel adds. */
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel->hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        /* Record the MTU on the packet's own (inner) route, if it has one. */
        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                /* Non-GSO IPv4 packet with DF set that does not fit. */
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        /* The control block is cleared before icmp_send(). */
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

                /* Lower the MTU metric of the inner IPv6 route when the
                 * tunnel has a unicast remote or the route is a host route
                 * (prefix length 128).
                 */
                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                /* Non-GSO IPv6 packet that does not fit. */
                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}
 648
/* Encapsulate @skb in an outer IPv4 header and transmit it.
 *
 * @skb:        packet to send; consumed on every path (sent or freed)
 * @dev:        the tunnel device
 * @tnl_params: outer IP header template (daddr, saddr, tos, ttl, frag_off)
 * @protocol:   outer IP protocol number; may be rewritten by the
 *              encapsulation layer via ip_tunnel_encap()
 *
 * Determines the outer destination (from the template, or from the inner
 * packet's next hop for NBMA tunnels), routes the outer packet (using the
 * per-CPU dst cache when the tunnel is "connected"), performs PMTU
 * handling, derives TOS/TTL/DF, makes sure there is enough headroom and
 * hands the packet to iptunnel_xmit().  Errors are accounted in
 * dev->stats.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *inner_iph;
        struct flowi4 fl4;
        u8     tos, ttl;
        __be16 df;
        struct rtable *rt;              /* Route to the other host */
        unsigned int max_headroom;      /* The extra header space needed */
        __be32 dst;
        int err;
        bool connected;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        /* "connected": the tunnel has a fixed remote, so the cached route
         * may be used.  Cleared below whenever the route depends on the
         * individual packet.
         */
        connected = (tunnel->parms.iph.daddr != 0);

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                /* The outer destination comes from the inner packet's
                 * route, so the skb must carry one.
                 */
                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        /* Unspecified neighbour address: fall back to the
                         * packet's own destination address.
                         */
                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        /* Only IPv4-compatible addresses embed a usable
                         * IPv4 destination (their last 32 bits).
                         */
                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        /* Release before branching so no path leaks it. */
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                connected = false;
        }

        tos = tnl_params->tos;
        /* Low bit set in the template TOS: take the TOS from the inner
         * header instead.  The route then depends on the packet, so the
         * cached route is not used.
         */
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

        /* Optional extra encapsulation; receives pointers to protocol
         * and fl4.
         */
        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        /* Try the per-CPU cached route first (also yields fl4.saddr). */
        rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (connected)
                        tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
        }

        /* The outer route points back at the tunnel device itself. */
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        /* While errors were recorded recently (within IPTUNNEL_ERR_TIMEO
         * of err_time), report a link failure for this packet and consume
         * one count; otherwise forget the stale error state.
         */
        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        /* TTL 0 in the template: inherit from the inner header, or use
         * the route's hop limit for non-IP payloads.
         */
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        /* Outer DF: template value, plus the inner DF bit for IPv4. */
        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP))
                df |= (inner_iph->frag_off&htons(IP_DF));

        /* Headroom needed in front of the payload; the device's
         * needed_headroom only ever grows here.
         */
        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        /* Last argument is true when the tunnel's netns differs from the
         * device's netns.
         */
        err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
                            tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
        iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
 813
 814static void ip_tunnel_update(struct ip_tunnel_net *itn,
 815                             struct ip_tunnel *t,
 816                             struct net_device *dev,
 817                             struct ip_tunnel_parm *p,
 818                             bool set_mtu)
 819{
 820        ip_tunnel_del(itn, t);
 821        t->parms.iph.saddr = p->iph.saddr;
 822        t->parms.iph.daddr = p->iph.daddr;
 823        t->parms.i_key = p->i_key;
 824        t->parms.o_key = p->o_key;
 825        if (dev->type != ARPHRD_ETHER) {
 826                memcpy(dev->dev_addr, &p->iph.saddr, 4);
 827                memcpy(dev->broadcast, &p->iph.daddr, 4);
 828        }
 829        ip_tunnel_add(itn, t);
 830
 831        t->parms.iph.ttl = p->iph.ttl;
 832        t->parms.iph.tos = p->iph.tos;
 833        t->parms.iph.frag_off = p->iph.frag_off;
 834
 835        if (t->parms.link != p->link) {
 836                int mtu;
 837
 838                t->parms.link = p->link;
 839                mtu = ip_tunnel_bind_dev(dev);
 840                if (set_mtu)
 841                        dev->mtu = mtu;
 842        }
 843        ip_tunnel_dst_reset_all(t);
 844        netdev_state_change(dev);
 845}
 846
/* Handle the SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL
 * requests for a tunnel device.
 *
 * @dev: device the request is made against
 * @p:   tunnel parameters; used as the lookup key and, for SIOCGETTUNNEL,
 *       overwritten with the parameters of the selected tunnel
 * @cmd: the request code
 *
 * Returns 0 on success or a negative errno:
 *   -EPERM   caller lacks CAP_NET_ADMIN in the user namespace of the tunnel's
 *            netns (add/change/delete), or the delete targets the FB tunnel
 *   -EEXIST  add: a matching tunnel already exists; change: the new
 *            parameters match a tunnel that belongs to another device
 *   -ENOENT  change/delete issued on the FB device and no tunnel matches @p
 *   -EINVAL  unknown @cmd, or a change that would not match the device's
 *            IFF_POINTOPOINT/IFF_BROADCAST flags
 * SIOCADDTUNNEL may also return whatever error ip_tunnel_create() reports.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        BUG_ON(!itn->fb_tunnel_dev);
        switch (cmd) {
        case SIOCGETTUNNEL:
                /* On the FB device, report the tunnel matching @p if there is
                 * one; otherwise (and on any other device) report the
                 * device's own tunnel.
                 */
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                /* A non-zero TTL forces the DF bit on. */
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                /* Except for VTI, a key is only kept when the matching
                 * TUNNEL_KEY flag is set.
                 */
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                /* Only SIOCCHGTUNNEL reaches this point (the add case always
                 * breaks out above).
                 */
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                /* New parameters collide with another device's
                                 * tunnel.
                                 */
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                /* Flags implied by the requested destination:
                                 * multicast -> broadcast, any other non-zero
                                 * address -> point-to-point.
                                 */
                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                /* Reject a change that disagrees with the
                                 * device's current flags.
                                 */
                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        /* NOTE(review): @dev, not t->dev, is passed here; when
                         * the request was made on the FB device the two can
                         * differ - confirm this is intended.
                         */
                        ip_tunnel_update(itn, t, dev, p, true);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                /* On the FB device the tunnel to delete is selected by @p;
                 * the FB tunnel itself cannot be deleted this way.
                 */
                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 949
 950int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 951{
 952        struct ip_tunnel *tunnel = netdev_priv(dev);
 953        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 954
 955        if (new_mtu < 68 ||
 956            new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
 957                return -EINVAL;
 958        dev->mtu = new_mtu;
 959        return 0;
 960}
 961EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 962
 963static void ip_tunnel_dev_free(struct net_device *dev)
 964{
 965        struct ip_tunnel *tunnel = netdev_priv(dev);
 966
 967        gro_cells_destroy(&tunnel->gro_cells);
 968        free_percpu(tunnel->dst_cache);
 969        free_percpu(dev->tstats);
 970        free_netdev(dev);
 971}
 972
 973void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 974{
 975        struct ip_tunnel *tunnel = netdev_priv(dev);
 976        struct ip_tunnel_net *itn;
 977
 978        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 979
 980        if (itn->fb_tunnel_dev != dev) {
 981                ip_tunnel_del(itn, netdev_priv(dev));
 982                unregister_netdevice_queue(dev, head);
 983        }
 984}
 985EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
 986
 987struct net *ip_tunnel_get_link_net(const struct net_device *dev)
 988{
 989        struct ip_tunnel *tunnel = netdev_priv(dev);
 990
 991        return tunnel->net;
 992}
 993EXPORT_SYMBOL(ip_tunnel_get_link_net);
 994
 995int ip_tunnel_get_iflink(const struct net_device *dev)
 996{
 997        struct ip_tunnel *tunnel = netdev_priv(dev);
 998
 999        return tunnel->parms.link;
1000}
1001EXPORT_SYMBOL(ip_tunnel_get_iflink);
1002
1003int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
1004                                  struct rtnl_link_ops *ops, char *devname)
1005{
1006        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1007        struct ip_tunnel_parm parms;
1008        unsigned int i;
1009
1010        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1011                INIT_HLIST_HEAD(&itn->tunnels[i]);
1012
1013        if (!ops) {
1014                itn->fb_tunnel_dev = NULL;
1015                return 0;
1016        }
1017
1018        memset(&parms, 0, sizeof(parms));
1019        if (devname)
1020                strlcpy(parms.name, devname, IFNAMSIZ);
1021
1022        rtnl_lock();
1023        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1024        /* FB netdevice is special: we have one, and only one per netns.
1025         * Allowing to move it to another netns is clearly unsafe.
1026         */
1027        if (!IS_ERR(itn->fb_tunnel_dev)) {
1028                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1029                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1030                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1031        }
1032        rtnl_unlock();
1033
1034        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1035}
1036EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1037
1038static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1039                              struct rtnl_link_ops *ops)
1040{
1041        struct net *net = dev_net(itn->fb_tunnel_dev);
1042        struct net_device *dev, *aux;
1043        int h;
1044
1045        for_each_netdev_safe(net, dev, aux)
1046                if (dev->rtnl_link_ops == ops)
1047                        unregister_netdevice_queue(dev, head);
1048
1049        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1050                struct ip_tunnel *t;
1051                struct hlist_node *n;
1052                struct hlist_head *thead = &itn->tunnels[h];
1053
1054                hlist_for_each_entry_safe(t, n, thead, hash_node)
1055                        /* If dev is in the same netns, it has already
1056                         * been added to the list by the previous loop.
1057                         */
1058                        if (!net_eq(dev_net(t->dev), net))
1059                                unregister_netdevice_queue(t->dev, head);
1060        }
1061}
1062
1063void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1064{
1065        LIST_HEAD(list);
1066
1067        rtnl_lock();
1068        ip_tunnel_destroy(itn, &list, ops);
1069        unregister_netdevice_many(&list);
1070        rtnl_unlock();
1071}
1072EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1073
1074int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1075                      struct ip_tunnel_parm *p)
1076{
1077        struct ip_tunnel *nt;
1078        struct net *net = dev_net(dev);
1079        struct ip_tunnel_net *itn;
1080        int mtu;
1081        int err;
1082
1083        nt = netdev_priv(dev);
1084        itn = net_generic(net, nt->ip_tnl_net_id);
1085
1086        if (nt->collect_md) {
1087                if (rtnl_dereference(itn->collect_md_tun))
1088                        return -EEXIST;
1089        } else {
1090                if (ip_tunnel_find(itn, p, dev->type))
1091                        return -EEXIST;
1092        }
1093
1094        nt->net = net;
1095        nt->parms = *p;
1096        err = register_netdevice(dev);
1097        if (err)
1098                goto out;
1099
1100        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1101                eth_hw_addr_random(dev);
1102
1103        mtu = ip_tunnel_bind_dev(dev);
1104        if (!tb[IFLA_MTU])
1105                dev->mtu = mtu;
1106
1107        ip_tunnel_add(itn, nt);
1108out:
1109        return err;
1110}
1111EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1112
1113int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1114                         struct ip_tunnel_parm *p)
1115{
1116        struct ip_tunnel *t;
1117        struct ip_tunnel *tunnel = netdev_priv(dev);
1118        struct net *net = tunnel->net;
1119        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1120
1121        if (dev == itn->fb_tunnel_dev)
1122                return -EINVAL;
1123
1124        t = ip_tunnel_find(itn, p, dev->type);
1125
1126        if (t) {
1127                if (t->dev != dev)
1128                        return -EEXIST;
1129        } else {
1130                t = tunnel;
1131
1132                if (dev->type != ARPHRD_ETHER) {
1133                        unsigned int nflags = 0;
1134
1135                        if (ipv4_is_multicast(p->iph.daddr))
1136                                nflags = IFF_BROADCAST;
1137                        else if (p->iph.daddr)
1138                                nflags = IFF_POINTOPOINT;
1139
1140                        if ((dev->flags ^ nflags) &
1141                            (IFF_POINTOPOINT | IFF_BROADCAST))
1142                                return -EINVAL;
1143                }
1144        }
1145
1146        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1147        return 0;
1148}
1149EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1150
1151int ip_tunnel_init(struct net_device *dev)
1152{
1153        struct ip_tunnel *tunnel = netdev_priv(dev);
1154        struct iphdr *iph = &tunnel->parms.iph;
1155        int err;
1156
1157        dev->destructor = ip_tunnel_dev_free;
1158        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1159        if (!dev->tstats)
1160                return -ENOMEM;
1161
1162        tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1163        if (!tunnel->dst_cache) {
1164                free_percpu(dev->tstats);
1165                return -ENOMEM;
1166        }
1167
1168        err = gro_cells_init(&tunnel->gro_cells, dev);
1169        if (err) {
1170                free_percpu(tunnel->dst_cache);
1171                free_percpu(dev->tstats);
1172                return err;
1173        }
1174
1175        tunnel->dev = dev;
1176        tunnel->net = dev_net(dev);
1177        strcpy(tunnel->parms.name, dev->name);
1178        iph->version            = 4;
1179        iph->ihl                = 5;
1180
1181        if (tunnel->collect_md) {
1182                dev->features |= NETIF_F_NETNS_LOCAL;
1183                netif_keep_dst(dev);
1184        }
1185        return 0;
1186}
1187EXPORT_SYMBOL_GPL(ip_tunnel_init);
1188
1189void ip_tunnel_uninit(struct net_device *dev)
1190{
1191        struct ip_tunnel *tunnel = netdev_priv(dev);
1192        struct net *net = tunnel->net;
1193        struct ip_tunnel_net *itn;
1194
1195        itn = net_generic(net, tunnel->ip_tnl_net_id);
1196        /* fb_tunnel_dev will be unregisted in net-exit call. */
1197        if (itn->fb_tunnel_dev != dev)
1198                ip_tunnel_del(itn, netdev_priv(dev));
1199
1200        ip_tunnel_dst_reset_all(tunnel);
1201}
1202EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1203
1204/* Do least required initialization, rest of init is done in tunnel_init call */
1205void ip_tunnel_setup(struct net_device *dev, int net_id)
1206{
1207        struct ip_tunnel *tunnel = netdev_priv(dev);
1208        tunnel->ip_tnl_net_id = net_id;
1209}
1210EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1211
1212MODULE_LICENSE("GPL");
1213