linux/net/ipv4/ip_tunnel.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2013 Nicira, Inc.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of version 2 of the GNU General Public
   6 * License as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful, but
   9 * WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * along with this program; if not, write to the Free Software
  15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16 * 02110-1301, USA
  17 */
  18
  19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  20
  21#include <linux/capability.h>
  22#include <linux/module.h>
  23#include <linux/types.h>
  24#include <linux/kernel.h>
  25#include <linux/slab.h>
  26#include <linux/uaccess.h>
  27#include <linux/skbuff.h>
  28#include <linux/netdevice.h>
  29#include <linux/in.h>
  30#include <linux/tcp.h>
  31#include <linux/udp.h>
  32#include <linux/if_arp.h>
  33#include <linux/init.h>
  34#include <linux/in6.h>
  35#include <linux/inetdevice.h>
  36#include <linux/igmp.h>
  37#include <linux/netfilter_ipv4.h>
  38#include <linux/etherdevice.h>
  39#include <linux/if_ether.h>
  40#include <linux/if_vlan.h>
  41#include <linux/rculist.h>
  42#include <linux/err.h>
  43
  44#include <net/sock.h>
  45#include <net/ip.h>
  46#include <net/icmp.h>
  47#include <net/protocol.h>
  48#include <net/ip_tunnels.h>
  49#include <net/arp.h>
  50#include <net/checksum.h>
  51#include <net/dsfield.h>
  52#include <net/inet_ecn.h>
  53#include <net/xfrm.h>
  54#include <net/net_namespace.h>
  55#include <net/netns/generic.h>
  56#include <net/rtnetlink.h>
  57#include <net/udp.h>
  58
  59#if IS_ENABLED(CONFIG_IPV6)
  60#include <net/ipv6.h>
  61#include <net/ip6_fib.h>
  62#include <net/ip6_route.h>
  63#endif
  64
  65static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
  66{
  67        return hash_32((__force u32)key ^ (__force u32)remote,
  68                         IP_TNL_HASH_BITS);
  69}
  70
  71static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
  72                                __be16 flags, __be32 key)
  73{
  74        if (p->i_flags & TUNNEL_KEY) {
  75                if (flags & TUNNEL_KEY)
  76                        return key == p->i_key;
  77                else
  78                        /* key expected, none present */
  79                        return false;
  80        } else
  81                return !(flags & TUNNEL_KEY);
  82}
  83
  84/* Fallback tunnel: no source, no destination, no key, no options
  85
  86   Tunnel hash table:
  87   We require exact key match i.e. if a key is present in packet
  88   it will match only tunnel with the same key; if it is not present,
  89   it will match only keyless tunnel.
  90
  91   All keysless packets, if not matched configured keyless tunnels
  92   will match fallback tunnel.
  93   Given src, dst and key, find appropriate for input tunnel.
  94*/
  95struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
  96                                   int link, __be16 flags,
  97                                   __be32 remote, __be32 local,
  98                                   __be32 key)
  99{
 100        unsigned int hash;
 101        struct ip_tunnel *t, *cand = NULL;
 102        struct hlist_head *head;
 103
 104        hash = ip_tunnel_hash(key, remote);
 105        head = &itn->tunnels[hash];
 106
 107        hlist_for_each_entry_rcu(t, head, hash_node) {
 108                if (local != t->parms.iph.saddr ||
 109                    remote != t->parms.iph.daddr ||
 110                    !(t->dev->flags & IFF_UP))
 111                        continue;
 112
 113                if (!ip_tunnel_key_match(&t->parms, flags, key))
 114                        continue;
 115
 116                if (t->parms.link == link)
 117                        return t;
 118                else
 119                        cand = t;
 120        }
 121
 122        hlist_for_each_entry_rcu(t, head, hash_node) {
 123                if (remote != t->parms.iph.daddr ||
 124                    t->parms.iph.saddr != 0 ||
 125                    !(t->dev->flags & IFF_UP))
 126                        continue;
 127
 128                if (!ip_tunnel_key_match(&t->parms, flags, key))
 129                        continue;
 130
 131                if (t->parms.link == link)
 132                        return t;
 133                else if (!cand)
 134                        cand = t;
 135        }
 136
 137        hash = ip_tunnel_hash(key, 0);
 138        head = &itn->tunnels[hash];
 139
 140        hlist_for_each_entry_rcu(t, head, hash_node) {
 141                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
 142                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
 143                        continue;
 144
 145                if (!(t->dev->flags & IFF_UP))
 146                        continue;
 147
 148                if (!ip_tunnel_key_match(&t->parms, flags, key))
 149                        continue;
 150
 151                if (t->parms.link == link)
 152                        return t;
 153                else if (!cand)
 154                        cand = t;
 155        }
 156
 157        if (flags & TUNNEL_NO_KEY)
 158                goto skip_key_lookup;
 159
 160        hlist_for_each_entry_rcu(t, head, hash_node) {
 161                if (t->parms.i_key != key ||
 162                    t->parms.iph.saddr != 0 ||
 163                    t->parms.iph.daddr != 0 ||
 164                    !(t->dev->flags & IFF_UP))
 165                        continue;
 166
 167                if (t->parms.link == link)
 168                        return t;
 169                else if (!cand)
 170                        cand = t;
 171        }
 172
 173skip_key_lookup:
 174        if (cand)
 175                return cand;
 176
 177        t = rcu_dereference(itn->collect_md_tun);
 178        if (t)
 179                return t;
 180
 181        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
 182                return netdev_priv(itn->fb_tunnel_dev);
 183
 184        return NULL;
 185}
 186EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
 187
 188static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 189                                    struct ip_tunnel_parm *parms)
 190{
 191        unsigned int h;
 192        __be32 remote;
 193        __be32 i_key = parms->i_key;
 194
 195        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 196                remote = parms->iph.daddr;
 197        else
 198                remote = 0;
 199
 200        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
 201                i_key = 0;
 202
 203        h = ip_tunnel_hash(i_key, remote);
 204        return &itn->tunnels[h];
 205}
 206
 207static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 208{
 209        struct hlist_head *head = ip_bucket(itn, &t->parms);
 210
 211        if (t->collect_md)
 212                rcu_assign_pointer(itn->collect_md_tun, t);
 213        hlist_add_head_rcu(&t->hash_node, head);
 214}
 215
 216static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 217{
 218        if (t->collect_md)
 219                rcu_assign_pointer(itn->collect_md_tun, NULL);
 220        hlist_del_init_rcu(&t->hash_node);
 221}
 222
 223static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 224                                        struct ip_tunnel_parm *parms,
 225                                        int type)
 226{
 227        __be32 remote = parms->iph.daddr;
 228        __be32 local = parms->iph.saddr;
 229        __be32 key = parms->i_key;
 230        __be16 flags = parms->i_flags;
 231        int link = parms->link;
 232        struct ip_tunnel *t = NULL;
 233        struct hlist_head *head = ip_bucket(itn, parms);
 234
 235        hlist_for_each_entry_rcu(t, head, hash_node) {
 236                if (local == t->parms.iph.saddr &&
 237                    remote == t->parms.iph.daddr &&
 238                    link == t->parms.link &&
 239                    type == t->dev->type &&
 240                    ip_tunnel_key_match(&t->parms, flags, key))
 241                        break;
 242        }
 243        return t;
 244}
 245
 246static struct net_device *__ip_tunnel_create(struct net *net,
 247                                             const struct rtnl_link_ops *ops,
 248                                             struct ip_tunnel_parm *parms)
 249{
 250        int err;
 251        struct ip_tunnel *tunnel;
 252        struct net_device *dev;
 253        char name[IFNAMSIZ];
 254
 255        if (parms->name[0])
 256                strlcpy(name, parms->name, IFNAMSIZ);
 257        else {
 258                if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
 259                        err = -E2BIG;
 260                        goto failed;
 261                }
 262                strlcpy(name, ops->kind, IFNAMSIZ);
 263                strncat(name, "%d", 2);
 264        }
 265
 266        ASSERT_RTNL();
 267        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
 268        if (!dev) {
 269                err = -ENOMEM;
 270                goto failed;
 271        }
 272        dev_net_set(dev, net);
 273
 274        dev->rtnl_link_ops = ops;
 275
 276        tunnel = netdev_priv(dev);
 277        tunnel->parms = *parms;
 278        tunnel->net = net;
 279
 280        err = register_netdevice(dev);
 281        if (err)
 282                goto failed_free;
 283
 284        return dev;
 285
 286failed_free:
 287        free_netdev(dev);
 288failed:
 289        return ERR_PTR(err);
 290}
 291
 292static inline void init_tunnel_flow(struct flowi4 *fl4,
 293                                    int proto,
 294                                    __be32 daddr, __be32 saddr,
 295                                    __be32 key, __u8 tos, int oif)
 296{
 297        memset(fl4, 0, sizeof(*fl4));
 298        fl4->flowi4_oif = oif;
 299        fl4->daddr = daddr;
 300        fl4->saddr = saddr;
 301        fl4->flowi4_tos = tos;
 302        fl4->flowi4_proto = proto;
 303        fl4->fl4_gre_key = key;
 304}
 305
 306static int ip_tunnel_bind_dev(struct net_device *dev)
 307{
 308        struct net_device *tdev = NULL;
 309        struct ip_tunnel *tunnel = netdev_priv(dev);
 310        const struct iphdr *iph;
 311        int hlen = LL_MAX_HEADER;
 312        int mtu = ETH_DATA_LEN;
 313        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 314
 315        iph = &tunnel->parms.iph;
 316
 317        /* Guess output device to choose reasonable mtu and needed_headroom */
 318        if (iph->daddr) {
 319                struct flowi4 fl4;
 320                struct rtable *rt;
 321
 322                init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
 323                                 iph->saddr, tunnel->parms.o_key,
 324                                 RT_TOS(iph->tos), tunnel->parms.link);
 325                rt = ip_route_output_key(tunnel->net, &fl4);
 326
 327                if (!IS_ERR(rt)) {
 328                        tdev = rt->dst.dev;
 329                        ip_rt_put(rt);
 330                }
 331                if (dev->type != ARPHRD_ETHER)
 332                        dev->flags |= IFF_POINTOPOINT;
 333
 334                dst_cache_reset(&tunnel->dst_cache);
 335        }
 336
 337        if (!tdev && tunnel->parms.link)
 338                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 339
 340        if (tdev) {
 341                hlen = tdev->hard_header_len + tdev->needed_headroom;
 342                mtu = tdev->mtu;
 343        }
 344
 345        dev->needed_headroom = t_hlen + hlen;
 346        mtu -= (dev->hard_header_len + t_hlen);
 347
 348        if (mtu < 68)
 349                mtu = 68;
 350
 351        return mtu;
 352}
 353
 354static struct ip_tunnel *ip_tunnel_create(struct net *net,
 355                                          struct ip_tunnel_net *itn,
 356                                          struct ip_tunnel_parm *parms)
 357{
 358        struct ip_tunnel *nt;
 359        struct net_device *dev;
 360
 361        BUG_ON(!itn->fb_tunnel_dev);
 362        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 363        if (IS_ERR(dev))
 364                return ERR_CAST(dev);
 365
 366        dev->mtu = ip_tunnel_bind_dev(dev);
 367
 368        nt = netdev_priv(dev);
 369        ip_tunnel_add(itn, nt);
 370        return nt;
 371}
 372
 373int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 374                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 375                  bool log_ecn_error)
 376{
 377        struct pcpu_sw_netstats *tstats;
 378        const struct iphdr *iph = ip_hdr(skb);
 379        int err;
 380
 381#ifdef CONFIG_NET_IPGRE_BROADCAST
 382        if (ipv4_is_multicast(iph->daddr)) {
 383                tunnel->dev->stats.multicast++;
 384                skb->pkt_type = PACKET_BROADCAST;
 385        }
 386#endif
 387
 388        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
 389             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
 390                tunnel->dev->stats.rx_crc_errors++;
 391                tunnel->dev->stats.rx_errors++;
 392                goto drop;
 393        }
 394
 395        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
 396                if (!(tpi->flags&TUNNEL_SEQ) ||
 397                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
 398                        tunnel->dev->stats.rx_fifo_errors++;
 399                        tunnel->dev->stats.rx_errors++;
 400                        goto drop;
 401                }
 402                tunnel->i_seqno = ntohl(tpi->seq) + 1;
 403        }
 404
 405        skb_reset_network_header(skb);
 406
 407        err = IP_ECN_decapsulate(iph, skb);
 408        if (unlikely(err)) {
 409                if (log_ecn_error)
 410                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
 411                                        &iph->saddr, iph->tos);
 412                if (err > 1) {
 413                        ++tunnel->dev->stats.rx_frame_errors;
 414                        ++tunnel->dev->stats.rx_errors;
 415                        goto drop;
 416                }
 417        }
 418
 419        tstats = this_cpu_ptr(tunnel->dev->tstats);
 420        u64_stats_update_begin(&tstats->syncp);
 421        tstats->rx_packets++;
 422        tstats->rx_bytes += skb->len;
 423        u64_stats_update_end(&tstats->syncp);
 424
 425        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
 426
 427        if (tunnel->dev->type == ARPHRD_ETHER) {
 428                skb->protocol = eth_type_trans(skb, tunnel->dev);
 429                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 430        } else {
 431                skb->dev = tunnel->dev;
 432        }
 433
 434        if (tun_dst)
 435                skb_dst_set(skb, (struct dst_entry *)tun_dst);
 436
 437        gro_cells_receive(&tunnel->gro_cells, skb);
 438        return 0;
 439
 440drop:
 441        kfree_skb(skb);
 442        return 0;
 443}
 444EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 445
 446static int ip_encap_hlen(struct ip_tunnel_encap *e)
 447{
 448        const struct ip_tunnel_encap_ops *ops;
 449        int hlen = -EINVAL;
 450
 451        if (e->type == TUNNEL_ENCAP_NONE)
 452                return 0;
 453
 454        if (e->type >= MAX_IPTUN_ENCAP_OPS)
 455                return -EINVAL;
 456
 457        rcu_read_lock();
 458        ops = rcu_dereference(iptun_encaps[e->type]);
 459        if (likely(ops && ops->encap_hlen))
 460                hlen = ops->encap_hlen(e);
 461        rcu_read_unlock();
 462
 463        return hlen;
 464}
 465
     /* Table of registered encapsulation handlers, indexed by encap type.
      * Slots are read with rcu_dereference() and claimed/cleared with
      * cmpxchg() by ip_tunnel_encap_add_ops()/ip_tunnel_encap_del_ops().
      */
  466const struct ip_tunnel_encap_ops __rcu *
  467                iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
 468
 469int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
 470                            unsigned int num)
 471{
 472        if (num >= MAX_IPTUN_ENCAP_OPS)
 473                return -ERANGE;
 474
 475        return !cmpxchg((const struct ip_tunnel_encap_ops **)
 476                        &iptun_encaps[num],
 477                        NULL, ops) ? 0 : -1;
 478}
 479EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
 480
 481int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
 482                            unsigned int num)
 483{
 484        int ret;
 485
 486        if (num >= MAX_IPTUN_ENCAP_OPS)
 487                return -ERANGE;
 488
 489        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
 490                       &iptun_encaps[num],
 491                       ops, NULL) == ops) ? 0 : -1;
 492
 493        synchronize_net();
 494
 495        return ret;
 496}
 497EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
 498
 499int ip_tunnel_encap_setup(struct ip_tunnel *t,
 500                          struct ip_tunnel_encap *ipencap)
 501{
 502        int hlen;
 503
 504        memset(&t->encap, 0, sizeof(t->encap));
 505
 506        hlen = ip_encap_hlen(ipencap);
 507        if (hlen < 0)
 508                return hlen;
 509
 510        t->encap.type = ipencap->type;
 511        t->encap.sport = ipencap->sport;
 512        t->encap.dport = ipencap->dport;
 513        t->encap.flags = ipencap->flags;
 514
 515        t->encap_hlen = hlen;
 516        t->hlen = t->encap_hlen + t->tun_hlen;
 517
 518        return 0;
 519}
 520EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 521
 522int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
 523                    u8 *protocol, struct flowi4 *fl4)
 524{
 525        const struct ip_tunnel_encap_ops *ops;
 526        int ret = -EINVAL;
 527
 528        if (t->encap.type == TUNNEL_ENCAP_NONE)
 529                return 0;
 530
 531        if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
 532                return -EINVAL;
 533
 534        rcu_read_lock();
 535        ops = rcu_dereference(iptun_encaps[t->encap.type]);
 536        if (likely(ops && ops->build_header))
 537                ret = ops->build_header(skb, &t->encap, protocol, fl4);
 538        rcu_read_unlock();
 539
 540        return ret;
 541}
 542EXPORT_SYMBOL(ip_tunnel_encap);
 543
     /* Work out the MTU available to the inner packet, propagate it to the
      * skb's dst and reject packets that are too big to be sent.
      *
      * @dev:       tunnel device
      * @skb:       packet about to be encapsulated
      * @rt:        route chosen for the outer packet
      * @df:        outer header frag_off template; non-zero selects the
      *             route-based MTU computation
      * @inner_iph: inner network header of @skb
      *
      * Returns 0 to continue transmission, or -E2BIG after an ICMP
      * "fragmentation needed" / ICMPv6 "packet too big" has been sent.
      */
  544static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
  545                            struct rtable *rt, __be16 df,
  546                            const struct iphdr *inner_iph)
  547{
  548        struct ip_tunnel *tunnel = netdev_priv(dev);
             /* Size of the payload once tunnel and link headers are removed. */
  549        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
  550        int mtu;
  551
             /* With df set: route MTU minus all headers this tunnel adds.
              * Otherwise: MTU of the skb's own dst, or the device MTU if the
              * skb has no dst.
              */
  552        if (df)
  553                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
  554                                        - sizeof(struct iphdr) - tunnel->hlen;
  555        else
  556                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
  557
  558        if (skb_dst(skb))
  559                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
  560
  561        if (skb->protocol == htons(ETH_P_IP)) {
                     /* Non-GSO IPv4 packet with DF set that does not fit:
                      * tell the sender and refuse it.
                      */
  562                if (!skb_is_gso(skb) &&
  563                    (inner_iph->frag_off & htons(IP_DF)) &&
  564                    mtu < pkt_size) {
  565                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
  566                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
  567                        return -E2BIG;
  568                }
  569        }
  570#if IS_ENABLED(CONFIG_IPV6)
  571        else if (skb->protocol == htons(ETH_P_IPV6)) {
  572                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
  573
                     /* Lower the MTU metric on the IPv6 route, but only for a
                      * tunnel with a unicast remote or for a /128 route.
                      */
  574                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
  575                           mtu >= IPV6_MIN_MTU) {
  576                        if ((tunnel->parms.iph.daddr &&
  577                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
  578                            rt6->rt6i_dst.plen == 128) {
  579                                rt6->rt6i_flags |= RTF_MODIFIED;
  580                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
  581                        }
  582                }
  583
                     /* Non-GSO IPv6 packet that does not fit: tell the sender
                      * and refuse it.
                      */
  584                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
  585                                        mtu < pkt_size) {
  586                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
  587                        return -E2BIG;
  588                }
  589        }
  590#endif
  591        return 0;
  592}
 593
     /* Encapsulate @skb and transmit it through tunnel device @dev.
      *
      * @skb:        packet to send; always consumed (sent or freed)
      * @dev:        tunnel device
      * @tnl_params: outer IPv4 header template; daddr, saddr, tos, ttl and
      *              frag_off are read from it
      * @protocol:   outer IP protocol; may be rewritten by ip_tunnel_encap()
      *
      * Nothing is returned.  On failure the skb is freed and the relevant
      * dev->stats counter is incremented.
      */
  594void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
  595                    const struct iphdr *tnl_params, u8 protocol)
  596{
  597        struct ip_tunnel *tunnel = netdev_priv(dev);
  598        const struct iphdr *inner_iph;
  599        struct flowi4 fl4;
  600        u8     tos, ttl;
  601        __be16 df;
  602        struct rtable *rt;              /* Route to the other host */
  603        unsigned int max_headroom;      /* The extra header space needed */
  604        __be32 dst;
  605        bool connected;
  606
  607        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
             /* "connected" gates use of the tunnel's dst cache further down;
              * it starts out true only when a remote address is configured.
              */
  608        connected = (tunnel->parms.iph.daddr != 0);
  609
  610        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
  611
  612        dst = tnl_params->daddr;
  613        if (dst == 0) {
  614                /* NBMA tunnel */
                     /* No fixed remote: derive the outer destination from the
                      * inner packet's route (IPv4) or neighbour entry (IPv6).
                      */
  615
  616                if (!skb_dst(skb)) {
  617                        dev->stats.tx_fifo_errors++;
  618                        goto tx_error;
  619                }
  620
  621                if (skb->protocol == htons(ETH_P_IP)) {
  622                        rt = skb_rtable(skb);
  623                        dst = rt_nexthop(rt, inner_iph->daddr);
  624                }
  625#if IS_ENABLED(CONFIG_IPV6)
  626                else if (skb->protocol == htons(ETH_P_IPV6)) {
  627                        const struct in6_addr *addr6;
  628                        struct neighbour *neigh;
  629                        bool do_tx_error_icmp;
  630                        int addr_type;
  631
  632                        neigh = dst_neigh_lookup(skb_dst(skb),
  633                                                 &ipv6_hdr(skb)->daddr);
  634                        if (!neigh)
  635                                goto tx_error;
  636
  637                        addr6 = (const struct in6_addr *)&neigh->primary_key;
  638                        addr_type = ipv6_addr_type(addr6);
  639
                             /* Unspecified neighbour key: use the packet's own
                              * destination address instead.
                              */
  640                        if (addr_type == IPV6_ADDR_ANY) {
  641                                addr6 = &ipv6_hdr(skb)->daddr;
  642                                addr_type = ipv6_addr_type(addr6);
  643                        }
  644
                             /* Only an IPv4-compatible address yields an outer
                              * IPv4 destination (its last 32 bits).
                              */
  645                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
  646                                do_tx_error_icmp = true;
  647                        else {
  648                                do_tx_error_icmp = false;
  649                                dst = addr6->s6_addr32[3];
  650                        }
                             /* Drop the neighbour reference before any jump. */
  651                        neigh_release(neigh);
  652                        if (do_tx_error_icmp)
  653                                goto tx_error_icmp;
  654                }
  655#endif
  656                else
  657                        goto tx_error;
  658
  659                connected = false;
  660        }
  661
  662        tos = tnl_params->tos;
             /* Low bit of the template TOS set: take the TOS from the inner
              * header instead; the dst cache is then bypassed.
              */
  663        if (tos & 0x1) {
  664                tos &= ~0x1;
  665                if (skb->protocol == htons(ETH_P_IP)) {
  666                        tos = inner_iph->tos;
  667                        connected = false;
  668                } else if (skb->protocol == htons(ETH_P_IPV6)) {
  669                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
  670                        connected = false;
  671                }
  672        }
  673
  674        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
  675                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
  676
  677        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
  678                goto tx_error;
  679
             /* Try the cached route first when the tunnel is "connected". */
  680        rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
  681                         NULL;
  682
  683        if (!rt) {
  684                rt = ip_route_output_key(tunnel->net, &fl4);
  685
  686                if (IS_ERR(rt)) {
  687                        dev->stats.tx_carrier_errors++;
  688                        goto tx_error;
  689                }
  690                if (connected)
  691                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
  692                                          fl4.saddr);
  693        }
  694
             /* The route leads back into this very tunnel device: refuse. */
  695        if (rt->dst.dev == dev) {
  696                ip_rt_put(rt);
  697                dev->stats.collisions++;
  698                goto tx_error;
  699        }
  700
  701        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
  702                ip_rt_put(rt);
  703                goto tx_error;
  704        }
  705
             /* While recorded errors are pending and still recent (within
              * IPTUNNEL_ERR_TIMEO of err_time), signal a link failure for
              * this packet and consume one count; otherwise forget them.
              */
  706        if (tunnel->err_count > 0) {
  707                if (time_before(jiffies,
  708                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
  709                        tunnel->err_count--;
  710
  711                        dst_link_failure(skb);
  712                } else
  713                        tunnel->err_count = 0;
  714        }
  715
  716        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
  717        ttl = tnl_params->ttl;
             /* TTL 0 in the template: take it from the inner header, or from
              * the route's hop limit for other protocols.
              */
  718        if (ttl == 0) {
  719                if (skb->protocol == htons(ETH_P_IP))
  720                        ttl = inner_iph->ttl;
  721#if IS_ENABLED(CONFIG_IPV6)
  722                else if (skb->protocol == htons(ETH_P_IPV6))
  723                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
  724#endif
  725                else
  726                        ttl = ip4_dst_hoplimit(&rt->dst);
  727        }
  728
             /* Outer DF: template value, plus the inner DF bit for IPv4. */
  729        df = tnl_params->frag_off;
  730        if (skb->protocol == htons(ETH_P_IP))
  731                df |= (inner_iph->frag_off&htons(IP_DF));
  732
             /* Grow the device's needed_headroom if this route needs more. */
  733        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
  734                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
  735        if (max_headroom > dev->needed_headroom)
  736                dev->needed_headroom = max_headroom;
  737
  738        if (skb_cow_head(skb, dev->needed_headroom)) {
  739                ip_rt_put(rt);
  740                dev->stats.tx_dropped++;
  741                kfree_skb(skb);
  742                return;
  743        }
  744
  745        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
  746                      df, !net_eq(tunnel->net, dev_net(dev)));
  747        return;
  748
  749#if IS_ENABLED(CONFIG_IPV6)
     /* Reached only from the IPv6 NBMA branch: report link failure first. */
  750tx_error_icmp:
  751        dst_link_failure(skb);
  752#endif
  753tx_error:
  754        dev->stats.tx_errors++;
  755        kfree_skb(skb);
  756}
  757EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
 758
     /* Apply new parameters @p to existing tunnel @t on device @dev.
      *
      * @itn:     per-netns tunnel table holding @t
      * @t:       tunnel to modify
      * @dev:     net_device whose addresses, MTU and state are updated
      * @p:       new parameters
      * @set_mtu: when the link changes, also store the recomputed MTU in
      *           dev->mtu
      */
  759static void ip_tunnel_update(struct ip_tunnel_net *itn,
  760                             struct ip_tunnel *t,
  761                             struct net_device *dev,
  762                             struct ip_tunnel_parm *p,
  763                             bool set_mtu)
  764{
             /* Addresses and i_key feed ip_bucket(), so the tunnel is taken
              * out of the hash table while they change and re-added after.
              */
  765        ip_tunnel_del(itn, t);
  766        t->parms.iph.saddr = p->iph.saddr;
  767        t->parms.iph.daddr = p->iph.daddr;
  768        t->parms.i_key = p->i_key;
  769        t->parms.o_key = p->o_key;
             /* Non-Ethernet devices mirror the tunnel endpoints in their
              * device and broadcast addresses.
              */
  770        if (dev->type != ARPHRD_ETHER) {
  771                memcpy(dev->dev_addr, &p->iph.saddr, 4);
  772                memcpy(dev->broadcast, &p->iph.daddr, 4);
  773        }
  774        ip_tunnel_add(itn, t);
  775
  776        t->parms.iph.ttl = p->iph.ttl;
  777        t->parms.iph.tos = p->iph.tos;
  778        t->parms.iph.frag_off = p->iph.frag_off;
  779
             /* A different underlying link needs a fresh bind (headroom and
              * MTU estimate).
              */
  780        if (t->parms.link != p->link) {
  781                int mtu;
  782
  783                t->parms.link = p->link;
  784                mtu = ip_tunnel_bind_dev(dev);
  785                if (set_mtu)
  786                        dev->mtu = mtu;
  787        }
             /* Cached routes may no longer match the new parameters. */
  788        dst_cache_reset(&t->dst_cache);
  789        netdev_state_change(dev);
  790}
 791
     /* Handle the tunnel ioctls SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL
      * and SIOCDELTUNNEL for tunnel device @dev.
      *
      * @dev: device the ioctl was issued on
      * @p:   tunnel parameters; input for all commands and overwritten with
      *       the tunnel's parameters by SIOCGETTUNNEL
      * @cmd: ioctl command
      *
      * Returns 0 on success or a negative errno: -EPERM without
      * CAP_NET_ADMIN (or when asked to delete the fallback tunnel), -EEXIST,
      * -ENOENT, -EINVAL, or an error from tunnel creation.
      */
  792int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
  793{
  794        int err = 0;
  795        struct ip_tunnel *t = netdev_priv(dev);
  796        struct net *net = t->net;
  797        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
  798
  799        BUG_ON(!itn->fb_tunnel_dev);
  800        switch (cmd) {
  801        case SIOCGETTUNNEL:
                     /* On the fallback device, @p selects which tunnel to
                      * report; fall back to the device's own tunnel if no
                      * match is found.
                      */
  802                if (dev == itn->fb_tunnel_dev) {
  803                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
  804                        if (!t)
  805                                t = netdev_priv(dev);
  806                }
  807                memcpy(p, &t->parms, sizeof(*p));
  808                break;
  809
  810        case SIOCADDTUNNEL:
  811        case SIOCCHGTUNNEL:
  812                err = -EPERM;
  813                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
  814                        goto done;
                     /* A fixed TTL forces the DF bit on the outer header. */
  815                if (p->iph.ttl)
  816                        p->iph.frag_off |= htons(IP_DF);
                     /* For non-VTI tunnels, ignore keys whose flag is unset. */
  817                if (!(p->i_flags & VTI_ISVTI)) {
  818                        if (!(p->i_flags & TUNNEL_KEY))
  819                                p->i_key = 0;
  820                        if (!(p->o_flags & TUNNEL_KEY))
  821                                p->o_key = 0;
  822                }
  823
  824                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
  825
  826                if (cmd == SIOCADDTUNNEL) {
  827                        if (!t) {
  828                                t = ip_tunnel_create(net, itn, p);
  829                                err = PTR_ERR_OR_ZERO(t);
  830                                break;
  831                        }
  832
  833                        err = -EEXIST;
  834                        break;
  835                }
                     /* Change issued on a specific (non-fallback) device. */
  836                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
  837                        if (t) {
                                     /* The new parameters already belong to a
                                      * different tunnel.
                                      */
  838                                if (t->dev != dev) {
  839                                        err = -EEXIST;
  840                                        break;
  841                                }
  842                        } else {
  843                                unsigned int nflags = 0;
  844
  845                                if (ipv4_is_multicast(p->iph.daddr))
  846                                        nflags = IFF_BROADCAST;
  847                                else if (p->iph.daddr)
  848                                        nflags = IFF_POINTOPOINT;
  849
                                     /* The change may not alter the device's
                                      * point-to-point / broadcast nature.
                                      */
  850                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
  851                                        err = -EINVAL;
  852                                        break;
  853                                }
  854
  855                                t = netdev_priv(dev);
  856                        }
  857                }
  858
  859                if (t) {
  860                        err = 0;
  861                        ip_tunnel_update(itn, t, dev, p, true);
  862                } else {
  863                        err = -ENOENT;
  864                }
  865                break;
  866
  867        case SIOCDELTUNNEL:
  868                err = -EPERM;
  869                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
  870                        goto done;
  871
                     /* On the fallback device, @p names the tunnel to delete;
                      * the fallback tunnel itself cannot be deleted this way.
                      */
  872                if (dev == itn->fb_tunnel_dev) {
  873                        err = -ENOENT;
  874                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
  875                        if (!t)
  876                                goto done;
  877                        err = -EPERM;
  878                        if (t == netdev_priv(itn->fb_tunnel_dev))
  879                                goto done;
  880                        dev = t->dev;
  881                }
  882                unregister_netdevice(dev);
  883                err = 0;
  884                break;
  885
  886        default:
  887                err = -EINVAL;
  888        }
  889
  890done:
  891        return err;
  892}
  893EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 894
 895int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 896{
 897        struct ip_tunnel *tunnel = netdev_priv(dev);
 898        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 899        int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
 900
 901        if (new_mtu < 68)
 902                return -EINVAL;
 903
 904        if (new_mtu > max_mtu) {
 905                if (strict)
 906                        return -EINVAL;
 907
 908                new_mtu = max_mtu;
 909        }
 910
 911        dev->mtu = new_mtu;
 912        return 0;
 913}
 914EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
 915
 916int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 917{
 918        return __ip_tunnel_change_mtu(dev, new_mtu, true);
 919}
 920EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 921
 922static void ip_tunnel_dev_free(struct net_device *dev)
 923{
 924        struct ip_tunnel *tunnel = netdev_priv(dev);
 925
 926        gro_cells_destroy(&tunnel->gro_cells);
 927        dst_cache_destroy(&tunnel->dst_cache);
 928        free_percpu(dev->tstats);
 929        free_netdev(dev);
 930}
 931
 932void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 933{
 934        struct ip_tunnel *tunnel = netdev_priv(dev);
 935        struct ip_tunnel_net *itn;
 936
 937        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 938
 939        if (itn->fb_tunnel_dev != dev) {
 940                ip_tunnel_del(itn, netdev_priv(dev));
 941                unregister_netdevice_queue(dev, head);
 942        }
 943}
 944EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
 945
 946struct net *ip_tunnel_get_link_net(const struct net_device *dev)
 947{
 948        struct ip_tunnel *tunnel = netdev_priv(dev);
 949
 950        return tunnel->net;
 951}
 952EXPORT_SYMBOL(ip_tunnel_get_link_net);
 953
 954int ip_tunnel_get_iflink(const struct net_device *dev)
 955{
 956        struct ip_tunnel *tunnel = netdev_priv(dev);
 957
 958        return tunnel->parms.link;
 959}
 960EXPORT_SYMBOL(ip_tunnel_get_iflink);
 961
 962int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 963                                  struct rtnl_link_ops *ops, char *devname)
 964{
 965        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
 966        struct ip_tunnel_parm parms;
 967        unsigned int i;
 968
 969        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
 970                INIT_HLIST_HEAD(&itn->tunnels[i]);
 971
 972        if (!ops) {
 973                itn->fb_tunnel_dev = NULL;
 974                return 0;
 975        }
 976
 977        memset(&parms, 0, sizeof(parms));
 978        if (devname)
 979                strlcpy(parms.name, devname, IFNAMSIZ);
 980
 981        rtnl_lock();
 982        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
 983        /* FB netdevice is special: we have one, and only one per netns.
 984         * Allowing to move it to another netns is clearly unsafe.
 985         */
 986        if (!IS_ERR(itn->fb_tunnel_dev)) {
 987                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
 988                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
 989                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
 990        }
 991        rtnl_unlock();
 992
 993        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
 994}
 995EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
 996
 997static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
 998                              struct rtnl_link_ops *ops)
 999{
1000        struct net *net = dev_net(itn->fb_tunnel_dev);
1001        struct net_device *dev, *aux;
1002        int h;
1003
1004        for_each_netdev_safe(net, dev, aux)
1005                if (dev->rtnl_link_ops == ops)
1006                        unregister_netdevice_queue(dev, head);
1007
1008        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1009                struct ip_tunnel *t;
1010                struct hlist_node *n;
1011                struct hlist_head *thead = &itn->tunnels[h];
1012
1013                hlist_for_each_entry_safe(t, n, thead, hash_node)
1014                        /* If dev is in the same netns, it has already
1015                         * been added to the list by the previous loop.
1016                         */
1017                        if (!net_eq(dev_net(t->dev), net))
1018                                unregister_netdevice_queue(t->dev, head);
1019        }
1020}
1021
1022void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1023{
1024        LIST_HEAD(list);
1025
1026        rtnl_lock();
1027        ip_tunnel_destroy(itn, &list, ops);
1028        unregister_netdevice_many(&list);
1029        rtnl_unlock();
1030}
1031EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1032
1033int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1034                      struct ip_tunnel_parm *p)
1035{
1036        struct ip_tunnel *nt;
1037        struct net *net = dev_net(dev);
1038        struct ip_tunnel_net *itn;
1039        int mtu;
1040        int err;
1041
1042        nt = netdev_priv(dev);
1043        itn = net_generic(net, nt->ip_tnl_net_id);
1044
1045        if (nt->collect_md) {
1046                if (rtnl_dereference(itn->collect_md_tun))
1047                        return -EEXIST;
1048        } else {
1049                if (ip_tunnel_find(itn, p, dev->type))
1050                        return -EEXIST;
1051        }
1052
1053        nt->net = net;
1054        nt->parms = *p;
1055        err = register_netdevice(dev);
1056        if (err)
1057                goto out;
1058
1059        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1060                eth_hw_addr_random(dev);
1061
1062        mtu = ip_tunnel_bind_dev(dev);
1063        if (!tb[IFLA_MTU])
1064                dev->mtu = mtu;
1065
1066        ip_tunnel_add(itn, nt);
1067out:
1068        return err;
1069}
1070EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1071
1072int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1073                         struct ip_tunnel_parm *p)
1074{
1075        struct ip_tunnel *t;
1076        struct ip_tunnel *tunnel = netdev_priv(dev);
1077        struct net *net = tunnel->net;
1078        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1079
1080        if (dev == itn->fb_tunnel_dev)
1081                return -EINVAL;
1082
1083        t = ip_tunnel_find(itn, p, dev->type);
1084
1085        if (t) {
1086                if (t->dev != dev)
1087                        return -EEXIST;
1088        } else {
1089                t = tunnel;
1090
1091                if (dev->type != ARPHRD_ETHER) {
1092                        unsigned int nflags = 0;
1093
1094                        if (ipv4_is_multicast(p->iph.daddr))
1095                                nflags = IFF_BROADCAST;
1096                        else if (p->iph.daddr)
1097                                nflags = IFF_POINTOPOINT;
1098
1099                        if ((dev->flags ^ nflags) &
1100                            (IFF_POINTOPOINT | IFF_BROADCAST))
1101                                return -EINVAL;
1102                }
1103        }
1104
1105        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1106        return 0;
1107}
1108EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1109
1110int ip_tunnel_init(struct net_device *dev)
1111{
1112        struct ip_tunnel *tunnel = netdev_priv(dev);
1113        struct iphdr *iph = &tunnel->parms.iph;
1114        int err;
1115
1116        dev->destructor = ip_tunnel_dev_free;
1117        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1118        if (!dev->tstats)
1119                return -ENOMEM;
1120
1121        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1122        if (err) {
1123                free_percpu(dev->tstats);
1124                return err;
1125        }
1126
1127        err = gro_cells_init(&tunnel->gro_cells, dev);
1128        if (err) {
1129                dst_cache_destroy(&tunnel->dst_cache);
1130                free_percpu(dev->tstats);
1131                return err;
1132        }
1133
1134        tunnel->dev = dev;
1135        tunnel->net = dev_net(dev);
1136        strcpy(tunnel->parms.name, dev->name);
1137        iph->version            = 4;
1138        iph->ihl                = 5;
1139
1140        if (tunnel->collect_md) {
1141                dev->features |= NETIF_F_NETNS_LOCAL;
1142                netif_keep_dst(dev);
1143        }
1144        return 0;
1145}
1146EXPORT_SYMBOL_GPL(ip_tunnel_init);
1147
1148void ip_tunnel_uninit(struct net_device *dev)
1149{
1150        struct ip_tunnel *tunnel = netdev_priv(dev);
1151        struct net *net = tunnel->net;
1152        struct ip_tunnel_net *itn;
1153
1154        itn = net_generic(net, tunnel->ip_tnl_net_id);
1155        /* fb_tunnel_dev will be unregisted in net-exit call. */
1156        if (itn->fb_tunnel_dev != dev)
1157                ip_tunnel_del(itn, netdev_priv(dev));
1158
1159        dst_cache_reset(&tunnel->dst_cache);
1160}
1161EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1162
1163/* Do least required initialization, rest of init is done in tunnel_init call */
1164void ip_tunnel_setup(struct net_device *dev, int net_id)
1165{
1166        struct ip_tunnel *tunnel = netdev_priv(dev);
1167        tunnel->ip_tnl_net_id = net_id;
1168}
1169EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1170
1171MODULE_LICENSE("GPL");
1172