linux/net/ipv4/ip_tunnel.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2013 Nicira, Inc.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of version 2 of the GNU General Public
   6 * License as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful, but
   9 * WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public License
  14 * along with this program; if not, write to the Free Software
  15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16 * 02110-1301, USA
  17 */
  18
  19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  20
  21#include <linux/capability.h>
  22#include <linux/module.h>
  23#include <linux/types.h>
  24#include <linux/kernel.h>
  25#include <linux/slab.h>
  26#include <linux/uaccess.h>
  27#include <linux/skbuff.h>
  28#include <linux/netdevice.h>
  29#include <linux/in.h>
  30#include <linux/tcp.h>
  31#include <linux/udp.h>
  32#include <linux/if_arp.h>
  33#include <linux/init.h>
  34#include <linux/in6.h>
  35#include <linux/inetdevice.h>
  36#include <linux/igmp.h>
  37#include <linux/netfilter_ipv4.h>
  38#include <linux/etherdevice.h>
  39#include <linux/if_ether.h>
  40#include <linux/if_vlan.h>
  41#include <linux/rculist.h>
  42#include <linux/err.h>
  43
  44#include <net/sock.h>
  45#include <net/ip.h>
  46#include <net/icmp.h>
  47#include <net/protocol.h>
  48#include <net/ip_tunnels.h>
  49#include <net/arp.h>
  50#include <net/checksum.h>
  51#include <net/dsfield.h>
  52#include <net/inet_ecn.h>
  53#include <net/xfrm.h>
  54#include <net/net_namespace.h>
  55#include <net/netns/generic.h>
  56#include <net/rtnetlink.h>
  57#include <net/udp.h>
  58#include <net/dst_metadata.h>
  59
  60#if IS_ENABLED(CONFIG_IPV6)
  61#include <net/ipv6.h>
  62#include <net/ip6_fib.h>
  63#include <net/ip6_route.h>
  64#endif
  65
  66static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
  67{
  68        return hash_32((__force u32)key ^ (__force u32)remote,
  69                         IP_TNL_HASH_BITS);
  70}
  71
  72static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
  73                                __be16 flags, __be32 key)
  74{
  75        if (p->i_flags & TUNNEL_KEY) {
  76                if (flags & TUNNEL_KEY)
  77                        return key == p->i_key;
  78                else
  79                        /* key expected, none present */
  80                        return false;
  81        } else
  82                return !(flags & TUNNEL_KEY);
  83}
  84
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
  96struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
  97                                   int link, __be16 flags,
  98                                   __be32 remote, __be32 local,
  99                                   __be32 key)
 100{
 101        unsigned int hash;
 102        struct ip_tunnel *t, *cand = NULL;
 103        struct hlist_head *head;
 104
 105        hash = ip_tunnel_hash(key, remote);
 106        head = &itn->tunnels[hash];
 107
 108        hlist_for_each_entry_rcu(t, head, hash_node) {
 109                if (local != t->parms.iph.saddr ||
 110                    remote != t->parms.iph.daddr ||
 111                    !(t->dev->flags & IFF_UP))
 112                        continue;
 113
 114                if (!ip_tunnel_key_match(&t->parms, flags, key))
 115                        continue;
 116
 117                if (t->parms.link == link)
 118                        return t;
 119                else
 120                        cand = t;
 121        }
 122
 123        hlist_for_each_entry_rcu(t, head, hash_node) {
 124                if (remote != t->parms.iph.daddr ||
 125                    t->parms.iph.saddr != 0 ||
 126                    !(t->dev->flags & IFF_UP))
 127                        continue;
 128
 129                if (!ip_tunnel_key_match(&t->parms, flags, key))
 130                        continue;
 131
 132                if (t->parms.link == link)
 133                        return t;
 134                else if (!cand)
 135                        cand = t;
 136        }
 137
 138        hash = ip_tunnel_hash(key, 0);
 139        head = &itn->tunnels[hash];
 140
 141        hlist_for_each_entry_rcu(t, head, hash_node) {
 142                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
 143                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
 144                        continue;
 145
 146                if (!(t->dev->flags & IFF_UP))
 147                        continue;
 148
 149                if (!ip_tunnel_key_match(&t->parms, flags, key))
 150                        continue;
 151
 152                if (t->parms.link == link)
 153                        return t;
 154                else if (!cand)
 155                        cand = t;
 156        }
 157
 158        if (flags & TUNNEL_NO_KEY)
 159                goto skip_key_lookup;
 160
 161        hlist_for_each_entry_rcu(t, head, hash_node) {
 162                if (t->parms.i_key != key ||
 163                    t->parms.iph.saddr != 0 ||
 164                    t->parms.iph.daddr != 0 ||
 165                    !(t->dev->flags & IFF_UP))
 166                        continue;
 167
 168                if (t->parms.link == link)
 169                        return t;
 170                else if (!cand)
 171                        cand = t;
 172        }
 173
 174skip_key_lookup:
 175        if (cand)
 176                return cand;
 177
 178        t = rcu_dereference(itn->collect_md_tun);
 179        if (t && t->dev->flags & IFF_UP)
 180                return t;
 181
 182        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
 183                return netdev_priv(itn->fb_tunnel_dev);
 184
 185        return NULL;
 186}
 187EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
 188
 189static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 190                                    struct ip_tunnel_parm *parms)
 191{
 192        unsigned int h;
 193        __be32 remote;
 194        __be32 i_key = parms->i_key;
 195
 196        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 197                remote = parms->iph.daddr;
 198        else
 199                remote = 0;
 200
 201        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
 202                i_key = 0;
 203
 204        h = ip_tunnel_hash(i_key, remote);
 205        return &itn->tunnels[h];
 206}
 207
 208static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 209{
 210        struct hlist_head *head = ip_bucket(itn, &t->parms);
 211
 212        if (t->collect_md)
 213                rcu_assign_pointer(itn->collect_md_tun, t);
 214        hlist_add_head_rcu(&t->hash_node, head);
 215}
 216
 217static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 218{
 219        if (t->collect_md)
 220                rcu_assign_pointer(itn->collect_md_tun, NULL);
 221        hlist_del_init_rcu(&t->hash_node);
 222}
 223
 224static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 225                                        struct ip_tunnel_parm *parms,
 226                                        int type)
 227{
 228        __be32 remote = parms->iph.daddr;
 229        __be32 local = parms->iph.saddr;
 230        __be32 key = parms->i_key;
 231        __be16 flags = parms->i_flags;
 232        int link = parms->link;
 233        struct ip_tunnel *t = NULL;
 234        struct hlist_head *head = ip_bucket(itn, parms);
 235
 236        hlist_for_each_entry_rcu(t, head, hash_node) {
 237                if (local == t->parms.iph.saddr &&
 238                    remote == t->parms.iph.daddr &&
 239                    link == t->parms.link &&
 240                    type == t->dev->type &&
 241                    ip_tunnel_key_match(&t->parms, flags, key))
 242                        break;
 243        }
 244        return t;
 245}
 246
 247static struct net_device *__ip_tunnel_create(struct net *net,
 248                                             const struct rtnl_link_ops *ops,
 249                                             struct ip_tunnel_parm *parms)
 250{
 251        int err;
 252        struct ip_tunnel *tunnel;
 253        struct net_device *dev;
 254        char name[IFNAMSIZ];
 255
 256        if (parms->name[0])
 257                strlcpy(name, parms->name, IFNAMSIZ);
 258        else {
 259                if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
 260                        err = -E2BIG;
 261                        goto failed;
 262                }
 263                strlcpy(name, ops->kind, IFNAMSIZ);
 264                strncat(name, "%d", 2);
 265        }
 266
 267        ASSERT_RTNL();
 268        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
 269        if (!dev) {
 270                err = -ENOMEM;
 271                goto failed;
 272        }
 273        dev_net_set(dev, net);
 274
 275        dev->rtnl_link_ops = ops;
 276
 277        tunnel = netdev_priv(dev);
 278        tunnel->parms = *parms;
 279        tunnel->net = net;
 280
 281        err = register_netdevice(dev);
 282        if (err)
 283                goto failed_free;
 284
 285        return dev;
 286
 287failed_free:
 288        free_netdev(dev);
 289failed:
 290        return ERR_PTR(err);
 291}
 292
 293static inline void init_tunnel_flow(struct flowi4 *fl4,
 294                                    int proto,
 295                                    __be32 daddr, __be32 saddr,
 296                                    __be32 key, __u8 tos, int oif,
 297                                    __u32 mark)
 298{
 299        memset(fl4, 0, sizeof(*fl4));
 300        fl4->flowi4_oif = oif;
 301        fl4->daddr = daddr;
 302        fl4->saddr = saddr;
 303        fl4->flowi4_tos = tos;
 304        fl4->flowi4_proto = proto;
 305        fl4->fl4_gre_key = key;
 306        fl4->flowi4_mark = mark;
 307}
 308
 309static int ip_tunnel_bind_dev(struct net_device *dev)
 310{
 311        struct net_device *tdev = NULL;
 312        struct ip_tunnel *tunnel = netdev_priv(dev);
 313        const struct iphdr *iph;
 314        int hlen = LL_MAX_HEADER;
 315        int mtu = ETH_DATA_LEN;
 316        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 317
 318        iph = &tunnel->parms.iph;
 319
 320        /* Guess output device to choose reasonable mtu and needed_headroom */
 321        if (iph->daddr) {
 322                struct flowi4 fl4;
 323                struct rtable *rt;
 324
 325                init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
 326                                 iph->saddr, tunnel->parms.o_key,
 327                                 RT_TOS(iph->tos), tunnel->parms.link,
 328                                 tunnel->fwmark);
 329                rt = ip_route_output_key(tunnel->net, &fl4);
 330
 331                if (!IS_ERR(rt)) {
 332                        tdev = rt->dst.dev;
 333                        ip_rt_put(rt);
 334                }
 335                if (dev->type != ARPHRD_ETHER)
 336                        dev->flags |= IFF_POINTOPOINT;
 337
 338                dst_cache_reset(&tunnel->dst_cache);
 339        }
 340
 341        if (!tdev && tunnel->parms.link)
 342                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 343
 344        if (tdev) {
 345                hlen = tdev->hard_header_len + tdev->needed_headroom;
 346                mtu = tdev->mtu;
 347        }
 348
 349        dev->needed_headroom = t_hlen + hlen;
 350        mtu -= (dev->hard_header_len + t_hlen);
 351
 352        if (mtu < IPV4_MIN_MTU)
 353                mtu = IPV4_MIN_MTU;
 354
 355        return mtu;
 356}
 357
 358static struct ip_tunnel *ip_tunnel_create(struct net *net,
 359                                          struct ip_tunnel_net *itn,
 360                                          struct ip_tunnel_parm *parms)
 361{
 362        struct ip_tunnel *nt;
 363        struct net_device *dev;
 364        int t_hlen;
 365
 366        BUG_ON(!itn->fb_tunnel_dev);
 367        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
 368        if (IS_ERR(dev))
 369                return ERR_CAST(dev);
 370
 371        dev->mtu = ip_tunnel_bind_dev(dev);
 372
 373        nt = netdev_priv(dev);
 374        t_hlen = nt->hlen + sizeof(struct iphdr);
 375        dev->min_mtu = ETH_MIN_MTU;
 376        dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
 377        ip_tunnel_add(itn, nt);
 378        return nt;
 379}
 380
 381int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 382                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 383                  bool log_ecn_error)
 384{
 385        struct pcpu_sw_netstats *tstats;
 386        const struct iphdr *iph = ip_hdr(skb);
 387        int err;
 388
 389#ifdef CONFIG_NET_IPGRE_BROADCAST
 390        if (ipv4_is_multicast(iph->daddr)) {
 391                tunnel->dev->stats.multicast++;
 392                skb->pkt_type = PACKET_BROADCAST;
 393        }
 394#endif
 395
 396        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
 397             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
 398                tunnel->dev->stats.rx_crc_errors++;
 399                tunnel->dev->stats.rx_errors++;
 400                goto drop;
 401        }
 402
 403        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
 404                if (!(tpi->flags&TUNNEL_SEQ) ||
 405                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
 406                        tunnel->dev->stats.rx_fifo_errors++;
 407                        tunnel->dev->stats.rx_errors++;
 408                        goto drop;
 409                }
 410                tunnel->i_seqno = ntohl(tpi->seq) + 1;
 411        }
 412
 413        skb_reset_network_header(skb);
 414
 415        err = IP_ECN_decapsulate(iph, skb);
 416        if (unlikely(err)) {
 417                if (log_ecn_error)
 418                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
 419                                        &iph->saddr, iph->tos);
 420                if (err > 1) {
 421                        ++tunnel->dev->stats.rx_frame_errors;
 422                        ++tunnel->dev->stats.rx_errors;
 423                        goto drop;
 424                }
 425        }
 426
 427        tstats = this_cpu_ptr(tunnel->dev->tstats);
 428        u64_stats_update_begin(&tstats->syncp);
 429        tstats->rx_packets++;
 430        tstats->rx_bytes += skb->len;
 431        u64_stats_update_end(&tstats->syncp);
 432
 433        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
 434
 435        if (tunnel->dev->type == ARPHRD_ETHER) {
 436                skb->protocol = eth_type_trans(skb, tunnel->dev);
 437                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 438        } else {
 439                skb->dev = tunnel->dev;
 440        }
 441
 442        if (tun_dst)
 443                skb_dst_set(skb, (struct dst_entry *)tun_dst);
 444
 445        gro_cells_receive(&tunnel->gro_cells, skb);
 446        return 0;
 447
 448drop:
 449        if (tun_dst)
 450                dst_release((struct dst_entry *)tun_dst);
 451        kfree_skb(skb);
 452        return 0;
 453}
 454EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 455
 456int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
 457                            unsigned int num)
 458{
 459        if (num >= MAX_IPTUN_ENCAP_OPS)
 460                return -ERANGE;
 461
 462        return !cmpxchg((const struct ip_tunnel_encap_ops **)
 463                        &iptun_encaps[num],
 464                        NULL, ops) ? 0 : -1;
 465}
 466EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
 467
 468int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
 469                            unsigned int num)
 470{
 471        int ret;
 472
 473        if (num >= MAX_IPTUN_ENCAP_OPS)
 474                return -ERANGE;
 475
 476        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
 477                       &iptun_encaps[num],
 478                       ops, NULL) == ops) ? 0 : -1;
 479
 480        synchronize_net();
 481
 482        return ret;
 483}
 484EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
 485
 486int ip_tunnel_encap_setup(struct ip_tunnel *t,
 487                          struct ip_tunnel_encap *ipencap)
 488{
 489        int hlen;
 490
 491        memset(&t->encap, 0, sizeof(t->encap));
 492
 493        hlen = ip_encap_hlen(ipencap);
 494        if (hlen < 0)
 495                return hlen;
 496
 497        t->encap.type = ipencap->type;
 498        t->encap.sport = ipencap->sport;
 499        t->encap.dport = ipencap->dport;
 500        t->encap.flags = ipencap->flags;
 501
 502        t->encap_hlen = hlen;
 503        t->hlen = t->encap_hlen + t->tun_hlen;
 504
 505        return 0;
 506}
 507EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 508
 509static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 510                            struct rtable *rt, __be16 df,
 511                            const struct iphdr *inner_iph)
 512{
 513        struct ip_tunnel *tunnel = netdev_priv(dev);
 514        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
 515        int mtu;
 516
 517        if (df)
 518                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
 519                                        - sizeof(struct iphdr) - tunnel->hlen;
 520        else
 521                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 522
 523        skb_dst_update_pmtu(skb, mtu);
 524
 525        if (skb->protocol == htons(ETH_P_IP)) {
 526                if (!skb_is_gso(skb) &&
 527                    (inner_iph->frag_off & htons(IP_DF)) &&
 528                    mtu < pkt_size) {
 529                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 530                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 531                        return -E2BIG;
 532                }
 533        }
 534#if IS_ENABLED(CONFIG_IPV6)
 535        else if (skb->protocol == htons(ETH_P_IPV6)) {
 536                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 537
 538                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
 539                           mtu >= IPV6_MIN_MTU) {
 540                        if ((tunnel->parms.iph.daddr &&
 541                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 542                            rt6->rt6i_dst.plen == 128) {
 543                                rt6->rt6i_flags |= RTF_MODIFIED;
 544                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 545                        }
 546                }
 547
 548                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
 549                                        mtu < pkt_size) {
 550                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 551                        return -E2BIG;
 552                }
 553        }
 554#endif
 555        return 0;
 556}
 557
 558void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
 559{
 560        struct ip_tunnel *tunnel = netdev_priv(dev);
 561        u32 headroom = sizeof(struct iphdr);
 562        struct ip_tunnel_info *tun_info;
 563        const struct ip_tunnel_key *key;
 564        const struct iphdr *inner_iph;
 565        struct rtable *rt;
 566        struct flowi4 fl4;
 567        __be16 df = 0;
 568        u8 tos, ttl;
 569
 570        tun_info = skb_tunnel_info(skb);
 571        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
 572                     ip_tunnel_info_af(tun_info) != AF_INET))
 573                goto tx_error;
 574        key = &tun_info->key;
 575        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 576        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 577        tos = key->tos;
 578        if (tos == 1) {
 579                if (skb->protocol == htons(ETH_P_IP))
 580                        tos = inner_iph->tos;
 581                else if (skb->protocol == htons(ETH_P_IPV6))
 582                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
 583        }
 584        init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
 585                         RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
 586        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
 587                goto tx_error;
 588        rt = ip_route_output_key(tunnel->net, &fl4);
 589        if (IS_ERR(rt)) {
 590                dev->stats.tx_carrier_errors++;
 591                goto tx_error;
 592        }
 593        if (rt->dst.dev == dev) {
 594                ip_rt_put(rt);
 595                dev->stats.collisions++;
 596                goto tx_error;
 597        }
 598        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
 599        ttl = key->ttl;
 600        if (ttl == 0) {
 601                if (skb->protocol == htons(ETH_P_IP))
 602                        ttl = inner_iph->ttl;
 603                else if (skb->protocol == htons(ETH_P_IPV6))
 604                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
 605                else
 606                        ttl = ip4_dst_hoplimit(&rt->dst);
 607        }
 608        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
 609                df = htons(IP_DF);
 610        else if (skb->protocol == htons(ETH_P_IP))
 611                df = inner_iph->frag_off & htons(IP_DF);
 612        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
 613        if (headroom > dev->needed_headroom)
 614                dev->needed_headroom = headroom;
 615
 616        if (skb_cow_head(skb, dev->needed_headroom)) {
 617                ip_rt_put(rt);
 618                goto tx_dropped;
 619        }
 620        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
 621                      df, !net_eq(tunnel->net, dev_net(dev)));
 622        return;
 623tx_error:
 624        dev->stats.tx_errors++;
 625        goto kfree;
 626tx_dropped:
 627        dev->stats.tx_dropped++;
 628kfree:
 629        kfree_skb(skb);
 630}
 631EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
 632
 633void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 634                    const struct iphdr *tnl_params, u8 protocol)
 635{
 636        struct ip_tunnel *tunnel = netdev_priv(dev);
 637        const struct iphdr *inner_iph;
 638        struct flowi4 fl4;
 639        u8     tos, ttl;
 640        __be16 df;
 641        struct rtable *rt;              /* Route to the other host */
 642        unsigned int max_headroom;      /* The extra header space needed */
 643        __be32 dst;
 644        bool connected;
 645
 646        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 647        connected = (tunnel->parms.iph.daddr != 0);
 648
 649        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 650
 651        dst = tnl_params->daddr;
 652        if (dst == 0) {
 653                /* NBMA tunnel */
 654
 655                if (!skb_dst(skb)) {
 656                        dev->stats.tx_fifo_errors++;
 657                        goto tx_error;
 658                }
 659
 660                if (skb->protocol == htons(ETH_P_IP)) {
 661                        rt = skb_rtable(skb);
 662                        dst = rt_nexthop(rt, inner_iph->daddr);
 663                }
 664#if IS_ENABLED(CONFIG_IPV6)
 665                else if (skb->protocol == htons(ETH_P_IPV6)) {
 666                        const struct in6_addr *addr6;
 667                        struct neighbour *neigh;
 668                        bool do_tx_error_icmp;
 669                        int addr_type;
 670
 671                        neigh = dst_neigh_lookup(skb_dst(skb),
 672                                                 &ipv6_hdr(skb)->daddr);
 673                        if (!neigh)
 674                                goto tx_error;
 675
 676                        addr6 = (const struct in6_addr *)&neigh->primary_key;
 677                        addr_type = ipv6_addr_type(addr6);
 678
 679                        if (addr_type == IPV6_ADDR_ANY) {
 680                                addr6 = &ipv6_hdr(skb)->daddr;
 681                                addr_type = ipv6_addr_type(addr6);
 682                        }
 683
 684                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 685                                do_tx_error_icmp = true;
 686                        else {
 687                                do_tx_error_icmp = false;
 688                                dst = addr6->s6_addr32[3];
 689                        }
 690                        neigh_release(neigh);
 691                        if (do_tx_error_icmp)
 692                                goto tx_error_icmp;
 693                }
 694#endif
 695                else
 696                        goto tx_error;
 697
 698                connected = false;
 699        }
 700
 701        tos = tnl_params->tos;
 702        if (tos & 0x1) {
 703                tos &= ~0x1;
 704                if (skb->protocol == htons(ETH_P_IP)) {
 705                        tos = inner_iph->tos;
 706                        connected = false;
 707                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 708                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
 709                        connected = false;
 710                }
 711        }
 712
 713        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
 714                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
 715                         tunnel->fwmark);
 716
 717        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
 718                goto tx_error;
 719
 720        rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
 721                         NULL;
 722
 723        if (!rt) {
 724                rt = ip_route_output_key(tunnel->net, &fl4);
 725
 726                if (IS_ERR(rt)) {
 727                        dev->stats.tx_carrier_errors++;
 728                        goto tx_error;
 729                }
 730                if (connected)
 731                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
 732                                          fl4.saddr);
 733        }
 734
 735        if (rt->dst.dev == dev) {
 736                ip_rt_put(rt);
 737                dev->stats.collisions++;
 738                goto tx_error;
 739        }
 740
 741        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
 742                ip_rt_put(rt);
 743                goto tx_error;
 744        }
 745
 746        if (tunnel->err_count > 0) {
 747                if (time_before(jiffies,
 748                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 749                        tunnel->err_count--;
 750
 751                        dst_link_failure(skb);
 752                } else
 753                        tunnel->err_count = 0;
 754        }
 755
 756        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
 757        ttl = tnl_params->ttl;
 758        if (ttl == 0) {
 759                if (skb->protocol == htons(ETH_P_IP))
 760                        ttl = inner_iph->ttl;
 761#if IS_ENABLED(CONFIG_IPV6)
 762                else if (skb->protocol == htons(ETH_P_IPV6))
 763                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
 764#endif
 765                else
 766                        ttl = ip4_dst_hoplimit(&rt->dst);
 767        }
 768
 769        df = tnl_params->frag_off;
 770        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
 771                df |= (inner_iph->frag_off&htons(IP_DF));
 772
 773        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
 774                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
 775        if (max_headroom > dev->needed_headroom)
 776                dev->needed_headroom = max_headroom;
 777
 778        if (skb_cow_head(skb, dev->needed_headroom)) {
 779                ip_rt_put(rt);
 780                dev->stats.tx_dropped++;
 781                kfree_skb(skb);
 782                return;
 783        }
 784
 785        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
 786                      df, !net_eq(tunnel->net, dev_net(dev)));
 787        return;
 788
 789#if IS_ENABLED(CONFIG_IPV6)
 790tx_error_icmp:
 791        dst_link_failure(skb);
 792#endif
 793tx_error:
 794        dev->stats.tx_errors++;
 795        kfree_skb(skb);
 796}
 797EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
 798
 799static void ip_tunnel_update(struct ip_tunnel_net *itn,
 800                             struct ip_tunnel *t,
 801                             struct net_device *dev,
 802                             struct ip_tunnel_parm *p,
 803                             bool set_mtu,
 804                             __u32 fwmark)
 805{
 806        ip_tunnel_del(itn, t);
 807        t->parms.iph.saddr = p->iph.saddr;
 808        t->parms.iph.daddr = p->iph.daddr;
 809        t->parms.i_key = p->i_key;
 810        t->parms.o_key = p->o_key;
 811        if (dev->type != ARPHRD_ETHER) {
 812                memcpy(dev->dev_addr, &p->iph.saddr, 4);
 813                memcpy(dev->broadcast, &p->iph.daddr, 4);
 814        }
 815        ip_tunnel_add(itn, t);
 816
 817        t->parms.iph.ttl = p->iph.ttl;
 818        t->parms.iph.tos = p->iph.tos;
 819        t->parms.iph.frag_off = p->iph.frag_off;
 820
 821        if (t->parms.link != p->link || t->fwmark != fwmark) {
 822                int mtu;
 823
 824                t->parms.link = p->link;
 825                t->fwmark = fwmark;
 826                mtu = ip_tunnel_bind_dev(dev);
 827                if (set_mtu)
 828                        dev->mtu = mtu;
 829        }
 830        dst_cache_reset(&t->dst_cache);
 831        netdev_state_change(dev);
 832}
 833
 834int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 835{
 836        int err = 0;
 837        struct ip_tunnel *t = netdev_priv(dev);
 838        struct net *net = t->net;
 839        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
 840
 841        BUG_ON(!itn->fb_tunnel_dev);
 842        switch (cmd) {
 843        case SIOCGETTUNNEL:
 844                if (dev == itn->fb_tunnel_dev) {
 845                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 846                        if (!t)
 847                                t = netdev_priv(dev);
 848                }
 849                memcpy(p, &t->parms, sizeof(*p));
 850                break;
 851
 852        case SIOCADDTUNNEL:
 853        case SIOCCHGTUNNEL:
 854                err = -EPERM;
 855                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 856                        goto done;
 857                if (p->iph.ttl)
 858                        p->iph.frag_off |= htons(IP_DF);
 859                if (!(p->i_flags & VTI_ISVTI)) {
 860                        if (!(p->i_flags & TUNNEL_KEY))
 861                                p->i_key = 0;
 862                        if (!(p->o_flags & TUNNEL_KEY))
 863                                p->o_key = 0;
 864                }
 865
 866                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 867
 868                if (cmd == SIOCADDTUNNEL) {
 869                        if (!t) {
 870                                t = ip_tunnel_create(net, itn, p);
 871                                err = PTR_ERR_OR_ZERO(t);
 872                                break;
 873                        }
 874
 875                        err = -EEXIST;
 876                        break;
 877                }
 878                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
 879                        if (t) {
 880                                if (t->dev != dev) {
 881                                        err = -EEXIST;
 882                                        break;
 883                                }
 884                        } else {
 885                                unsigned int nflags = 0;
 886
 887                                if (ipv4_is_multicast(p->iph.daddr))
 888                                        nflags = IFF_BROADCAST;
 889                                else if (p->iph.daddr)
 890                                        nflags = IFF_POINTOPOINT;
 891
 892                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
 893                                        err = -EINVAL;
 894                                        break;
 895                                }
 896
 897                                t = netdev_priv(dev);
 898                        }
 899                }
 900
 901                if (t) {
 902                        err = 0;
 903                        ip_tunnel_update(itn, t, dev, p, true, 0);
 904                } else {
 905                        err = -ENOENT;
 906                }
 907                break;
 908
 909        case SIOCDELTUNNEL:
 910                err = -EPERM;
 911                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 912                        goto done;
 913
 914                if (dev == itn->fb_tunnel_dev) {
 915                        err = -ENOENT;
 916                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 917                        if (!t)
 918                                goto done;
 919                        err = -EPERM;
 920                        if (t == netdev_priv(itn->fb_tunnel_dev))
 921                                goto done;
 922                        dev = t->dev;
 923                }
 924                unregister_netdevice(dev);
 925                err = 0;
 926                break;
 927
 928        default:
 929                err = -EINVAL;
 930        }
 931
 932done:
 933        return err;
 934}
 935EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 936
 937int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 938{
 939        struct ip_tunnel *tunnel = netdev_priv(dev);
 940        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 941        int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
 942
 943        if (new_mtu < ETH_MIN_MTU)
 944                return -EINVAL;
 945
 946        if (new_mtu > max_mtu) {
 947                if (strict)
 948                        return -EINVAL;
 949
 950                new_mtu = max_mtu;
 951        }
 952
 953        dev->mtu = new_mtu;
 954        return 0;
 955}
 956EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
 957
 958int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 959{
 960        return __ip_tunnel_change_mtu(dev, new_mtu, true);
 961}
 962EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 963
 964static void ip_tunnel_dev_free(struct net_device *dev)
 965{
 966        struct ip_tunnel *tunnel = netdev_priv(dev);
 967
 968        gro_cells_destroy(&tunnel->gro_cells);
 969        dst_cache_destroy(&tunnel->dst_cache);
 970        free_percpu(dev->tstats);
 971}
 972
 973void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 974{
 975        struct ip_tunnel *tunnel = netdev_priv(dev);
 976        struct ip_tunnel_net *itn;
 977
 978        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 979
 980        if (itn->fb_tunnel_dev != dev) {
 981                ip_tunnel_del(itn, netdev_priv(dev));
 982                unregister_netdevice_queue(dev, head);
 983        }
 984}
 985EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
 986
 987struct net *ip_tunnel_get_link_net(const struct net_device *dev)
 988{
 989        struct ip_tunnel *tunnel = netdev_priv(dev);
 990
 991        return tunnel->net;
 992}
 993EXPORT_SYMBOL(ip_tunnel_get_link_net);
 994
 995int ip_tunnel_get_iflink(const struct net_device *dev)
 996{
 997        struct ip_tunnel *tunnel = netdev_priv(dev);
 998
 999        return tunnel->parms.link;
1000}
1001EXPORT_SYMBOL(ip_tunnel_get_iflink);
1002
1003int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1004                                  struct rtnl_link_ops *ops, char *devname)
1005{
1006        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1007        struct ip_tunnel_parm parms;
1008        unsigned int i;
1009
1010        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1011                INIT_HLIST_HEAD(&itn->tunnels[i]);
1012
1013        if (!ops) {
1014                itn->fb_tunnel_dev = NULL;
1015                return 0;
1016        }
1017
1018        memset(&parms, 0, sizeof(parms));
1019        if (devname)
1020                strlcpy(parms.name, devname, IFNAMSIZ);
1021
1022        rtnl_lock();
1023        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1024        /* FB netdevice is special: we have one, and only one per netns.
1025         * Allowing to move it to another netns is clearly unsafe.
1026         */
1027        if (!IS_ERR(itn->fb_tunnel_dev)) {
1028                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1029                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1030                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1031        }
1032        rtnl_unlock();
1033
1034        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1035}
1036EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1037
1038static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1039                              struct rtnl_link_ops *ops)
1040{
1041        struct net *net = dev_net(itn->fb_tunnel_dev);
1042        struct net_device *dev, *aux;
1043        int h;
1044
1045        for_each_netdev_safe(net, dev, aux)
1046                if (dev->rtnl_link_ops == ops)
1047                        unregister_netdevice_queue(dev, head);
1048
1049        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1050                struct ip_tunnel *t;
1051                struct hlist_node *n;
1052                struct hlist_head *thead = &itn->tunnels[h];
1053
1054                hlist_for_each_entry_safe(t, n, thead, hash_node)
1055                        /* If dev is in the same netns, it has already
1056                         * been added to the list by the previous loop.
1057                         */
1058                        if (!net_eq(dev_net(t->dev), net))
1059                                unregister_netdevice_queue(t->dev, head);
1060        }
1061}
1062
1063void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1064                           struct rtnl_link_ops *ops)
1065{
1066        struct ip_tunnel_net *itn;
1067        struct net *net;
1068        LIST_HEAD(list);
1069
1070        rtnl_lock();
1071        list_for_each_entry(net, net_list, exit_list) {
1072                itn = net_generic(net, id);
1073                ip_tunnel_destroy(itn, &list, ops);
1074        }
1075        unregister_netdevice_many(&list);
1076        rtnl_unlock();
1077}
1078EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1079
1080int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1081                      struct ip_tunnel_parm *p, __u32 fwmark)
1082{
1083        struct ip_tunnel *nt;
1084        struct net *net = dev_net(dev);
1085        struct ip_tunnel_net *itn;
1086        int mtu;
1087        int err;
1088
1089        nt = netdev_priv(dev);
1090        itn = net_generic(net, nt->ip_tnl_net_id);
1091
1092        if (nt->collect_md) {
1093                if (rtnl_dereference(itn->collect_md_tun))
1094                        return -EEXIST;
1095        } else {
1096                if (ip_tunnel_find(itn, p, dev->type))
1097                        return -EEXIST;
1098        }
1099
1100        nt->net = net;
1101        nt->parms = *p;
1102        nt->fwmark = fwmark;
1103        err = register_netdevice(dev);
1104        if (err)
1105                goto out;
1106
1107        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1108                eth_hw_addr_random(dev);
1109
1110        mtu = ip_tunnel_bind_dev(dev);
1111        if (!tb[IFLA_MTU])
1112                dev->mtu = mtu;
1113
1114        ip_tunnel_add(itn, nt);
1115out:
1116        return err;
1117}
1118EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1119
1120int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1121                         struct ip_tunnel_parm *p, __u32 fwmark)
1122{
1123        struct ip_tunnel *t;
1124        struct ip_tunnel *tunnel = netdev_priv(dev);
1125        struct net *net = tunnel->net;
1126        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1127
1128        if (dev == itn->fb_tunnel_dev)
1129                return -EINVAL;
1130
1131        t = ip_tunnel_find(itn, p, dev->type);
1132
1133        if (t) {
1134                if (t->dev != dev)
1135                        return -EEXIST;
1136        } else {
1137                t = tunnel;
1138
1139                if (dev->type != ARPHRD_ETHER) {
1140                        unsigned int nflags = 0;
1141
1142                        if (ipv4_is_multicast(p->iph.daddr))
1143                                nflags = IFF_BROADCAST;
1144                        else if (p->iph.daddr)
1145                                nflags = IFF_POINTOPOINT;
1146
1147                        if ((dev->flags ^ nflags) &
1148                            (IFF_POINTOPOINT | IFF_BROADCAST))
1149                                return -EINVAL;
1150                }
1151        }
1152
1153        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1154        return 0;
1155}
1156EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1157
1158int ip_tunnel_init(struct net_device *dev)
1159{
1160        struct ip_tunnel *tunnel = netdev_priv(dev);
1161        struct iphdr *iph = &tunnel->parms.iph;
1162        int err;
1163
1164        dev->needs_free_netdev = true;
1165        dev->priv_destructor = ip_tunnel_dev_free;
1166        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1167        if (!dev->tstats)
1168                return -ENOMEM;
1169
1170        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1171        if (err) {
1172                free_percpu(dev->tstats);
1173                return err;
1174        }
1175
1176        err = gro_cells_init(&tunnel->gro_cells, dev);
1177        if (err) {
1178                dst_cache_destroy(&tunnel->dst_cache);
1179                free_percpu(dev->tstats);
1180                return err;
1181        }
1182
1183        tunnel->dev = dev;
1184        tunnel->net = dev_net(dev);
1185        strcpy(tunnel->parms.name, dev->name);
1186        iph->version            = 4;
1187        iph->ihl                = 5;
1188
1189        if (tunnel->collect_md) {
1190                dev->features |= NETIF_F_NETNS_LOCAL;
1191                netif_keep_dst(dev);
1192        }
1193        return 0;
1194}
1195EXPORT_SYMBOL_GPL(ip_tunnel_init);
1196
1197void ip_tunnel_uninit(struct net_device *dev)
1198{
1199        struct ip_tunnel *tunnel = netdev_priv(dev);
1200        struct net *net = tunnel->net;
1201        struct ip_tunnel_net *itn;
1202
1203        itn = net_generic(net, tunnel->ip_tnl_net_id);
1204        /* fb_tunnel_dev will be unregisted in net-exit call. */
1205        if (itn->fb_tunnel_dev != dev)
1206                ip_tunnel_del(itn, netdev_priv(dev));
1207
1208        dst_cache_reset(&tunnel->dst_cache);
1209}
1210EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1211
1212/* Do least required initialization, rest of init is done in tunnel_init call */
1213void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1214{
1215        struct ip_tunnel *tunnel = netdev_priv(dev);
1216        tunnel->ip_tnl_net_id = net_id;
1217}
1218EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1219
1220MODULE_LICENSE("GPL");
1221