linux/net/ipv4/ip_tunnel.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2013 Nicira, Inc.
   4 */
   5
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/capability.h>
   9#include <linux/module.h>
  10#include <linux/types.h>
  11#include <linux/kernel.h>
  12#include <linux/slab.h>
  13#include <linux/uaccess.h>
  14#include <linux/skbuff.h>
  15#include <linux/netdevice.h>
  16#include <linux/in.h>
  17#include <linux/tcp.h>
  18#include <linux/udp.h>
  19#include <linux/if_arp.h>
  20#include <linux/init.h>
  21#include <linux/in6.h>
  22#include <linux/inetdevice.h>
  23#include <linux/igmp.h>
  24#include <linux/netfilter_ipv4.h>
  25#include <linux/etherdevice.h>
  26#include <linux/if_ether.h>
  27#include <linux/if_vlan.h>
  28#include <linux/rculist.h>
  29#include <linux/err.h>
  30
  31#include <net/sock.h>
  32#include <net/ip.h>
  33#include <net/icmp.h>
  34#include <net/protocol.h>
  35#include <net/ip_tunnels.h>
  36#include <net/arp.h>
  37#include <net/checksum.h>
  38#include <net/dsfield.h>
  39#include <net/inet_ecn.h>
  40#include <net/xfrm.h>
  41#include <net/net_namespace.h>
  42#include <net/netns/generic.h>
  43#include <net/rtnetlink.h>
  44#include <net/udp.h>
  45#include <net/dst_metadata.h>
  46
  47#if IS_ENABLED(CONFIG_IPV6)
  48#include <net/ipv6.h>
  49#include <net/ip6_fib.h>
  50#include <net/ip6_route.h>
  51#endif
  52
  53static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
  54{
  55        return hash_32((__force u32)key ^ (__force u32)remote,
  56                         IP_TNL_HASH_BITS);
  57}
  58
  59static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
  60                                __be16 flags, __be32 key)
  61{
  62        if (p->i_flags & TUNNEL_KEY) {
  63                if (flags & TUNNEL_KEY)
  64                        return key == p->i_key;
  65                else
  66                        /* key expected, none present */
  67                        return false;
  68        } else
  69                return !(flags & TUNNEL_KEY);
  70}
  71
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
  83struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
  84                                   int link, __be16 flags,
  85                                   __be32 remote, __be32 local,
  86                                   __be32 key)
  87{
  88        unsigned int hash;
  89        struct ip_tunnel *t, *cand = NULL;
  90        struct hlist_head *head;
  91
  92        hash = ip_tunnel_hash(key, remote);
  93        head = &itn->tunnels[hash];
  94
  95        hlist_for_each_entry_rcu(t, head, hash_node) {
  96                if (local != t->parms.iph.saddr ||
  97                    remote != t->parms.iph.daddr ||
  98                    !(t->dev->flags & IFF_UP))
  99                        continue;
 100
 101                if (!ip_tunnel_key_match(&t->parms, flags, key))
 102                        continue;
 103
 104                if (t->parms.link == link)
 105                        return t;
 106                else
 107                        cand = t;
 108        }
 109
 110        hlist_for_each_entry_rcu(t, head, hash_node) {
 111                if (remote != t->parms.iph.daddr ||
 112                    t->parms.iph.saddr != 0 ||
 113                    !(t->dev->flags & IFF_UP))
 114                        continue;
 115
 116                if (!ip_tunnel_key_match(&t->parms, flags, key))
 117                        continue;
 118
 119                if (t->parms.link == link)
 120                        return t;
 121                else if (!cand)
 122                        cand = t;
 123        }
 124
 125        hash = ip_tunnel_hash(key, 0);
 126        head = &itn->tunnels[hash];
 127
 128        hlist_for_each_entry_rcu(t, head, hash_node) {
 129                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
 130                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
 131                        continue;
 132
 133                if (!(t->dev->flags & IFF_UP))
 134                        continue;
 135
 136                if (!ip_tunnel_key_match(&t->parms, flags, key))
 137                        continue;
 138
 139                if (t->parms.link == link)
 140                        return t;
 141                else if (!cand)
 142                        cand = t;
 143        }
 144
 145        if (flags & TUNNEL_NO_KEY)
 146                goto skip_key_lookup;
 147
 148        hlist_for_each_entry_rcu(t, head, hash_node) {
 149                if (t->parms.i_key != key ||
 150                    t->parms.iph.saddr != 0 ||
 151                    t->parms.iph.daddr != 0 ||
 152                    !(t->dev->flags & IFF_UP))
 153                        continue;
 154
 155                if (t->parms.link == link)
 156                        return t;
 157                else if (!cand)
 158                        cand = t;
 159        }
 160
 161skip_key_lookup:
 162        if (cand)
 163                return cand;
 164
 165        t = rcu_dereference(itn->collect_md_tun);
 166        if (t && t->dev->flags & IFF_UP)
 167                return t;
 168
 169        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
 170                return netdev_priv(itn->fb_tunnel_dev);
 171
 172        return NULL;
 173}
 174EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
 175
 176static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 177                                    struct ip_tunnel_parm *parms)
 178{
 179        unsigned int h;
 180        __be32 remote;
 181        __be32 i_key = parms->i_key;
 182
 183        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 184                remote = parms->iph.daddr;
 185        else
 186                remote = 0;
 187
 188        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
 189                i_key = 0;
 190
 191        h = ip_tunnel_hash(i_key, remote);
 192        return &itn->tunnels[h];
 193}
 194
 195static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 196{
 197        struct hlist_head *head = ip_bucket(itn, &t->parms);
 198
 199        if (t->collect_md)
 200                rcu_assign_pointer(itn->collect_md_tun, t);
 201        hlist_add_head_rcu(&t->hash_node, head);
 202}
 203
 204static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 205{
 206        if (t->collect_md)
 207                rcu_assign_pointer(itn->collect_md_tun, NULL);
 208        hlist_del_init_rcu(&t->hash_node);
 209}
 210
 211static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 212                                        struct ip_tunnel_parm *parms,
 213                                        int type)
 214{
 215        __be32 remote = parms->iph.daddr;
 216        __be32 local = parms->iph.saddr;
 217        __be32 key = parms->i_key;
 218        __be16 flags = parms->i_flags;
 219        int link = parms->link;
 220        struct ip_tunnel *t = NULL;
 221        struct hlist_head *head = ip_bucket(itn, parms);
 222
 223        hlist_for_each_entry_rcu(t, head, hash_node) {
 224                if (local == t->parms.iph.saddr &&
 225                    remote == t->parms.iph.daddr &&
 226                    link == t->parms.link &&
 227                    type == t->dev->type &&
 228                    ip_tunnel_key_match(&t->parms, flags, key))
 229                        break;
 230        }
 231        return t;
 232}
 233
 234static struct net_device *__ip_tunnel_create(struct net *net,
 235                                             const struct rtnl_link_ops *ops,
 236                                             struct ip_tunnel_parm *parms)
 237{
 238        int err;
 239        struct ip_tunnel *tunnel;
 240        struct net_device *dev;
 241        char name[IFNAMSIZ];
 242
 243        err = -E2BIG;
 244        if (parms->name[0]) {
 245                if (!dev_valid_name(parms->name))
 246                        goto failed;
 247                strlcpy(name, parms->name, IFNAMSIZ);
 248        } else {
 249                if (strlen(ops->kind) > (IFNAMSIZ - 3))
 250                        goto failed;
 251                strcpy(name, ops->kind);
 252                strcat(name, "%d");
 253        }
 254
 255        ASSERT_RTNL();
 256        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
 257        if (!dev) {
 258                err = -ENOMEM;
 259                goto failed;
 260        }
 261        dev_net_set(dev, net);
 262
 263        dev->rtnl_link_ops = ops;
 264
 265        tunnel = netdev_priv(dev);
 266        tunnel->parms = *parms;
 267        tunnel->net = net;
 268
 269        err = register_netdevice(dev);
 270        if (err)
 271                goto failed_free;
 272
 273        return dev;
 274
 275failed_free:
 276        free_netdev(dev);
 277failed:
 278        return ERR_PTR(err);
 279}
 280
 281static int ip_tunnel_bind_dev(struct net_device *dev)
 282{
 283        struct net_device *tdev = NULL;
 284        struct ip_tunnel *tunnel = netdev_priv(dev);
 285        const struct iphdr *iph;
 286        int hlen = LL_MAX_HEADER;
 287        int mtu = ETH_DATA_LEN;
 288        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 289
 290        iph = &tunnel->parms.iph;
 291
 292        /* Guess output device to choose reasonable mtu and needed_headroom */
 293        if (iph->daddr) {
 294                struct flowi4 fl4;
 295                struct rtable *rt;
 296
 297                ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
 298                                    iph->saddr, tunnel->parms.o_key,
 299                                    RT_TOS(iph->tos), tunnel->parms.link,
 300                                    tunnel->fwmark, 0);
 301                rt = ip_route_output_key(tunnel->net, &fl4);
 302
 303                if (!IS_ERR(rt)) {
 304                        tdev = rt->dst.dev;
 305                        ip_rt_put(rt);
 306                }
 307                if (dev->type != ARPHRD_ETHER)
 308                        dev->flags |= IFF_POINTOPOINT;
 309
 310                dst_cache_reset(&tunnel->dst_cache);
 311        }
 312
 313        if (!tdev && tunnel->parms.link)
 314                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 315
 316        if (tdev) {
 317                hlen = tdev->hard_header_len + tdev->needed_headroom;
 318                mtu = min(tdev->mtu, IP_MAX_MTU);
 319        }
 320
 321        dev->needed_headroom = t_hlen + hlen;
 322        mtu -= (dev->hard_header_len + t_hlen);
 323
 324        if (mtu < IPV4_MIN_MTU)
 325                mtu = IPV4_MIN_MTU;
 326
 327        return mtu;
 328}
 329
 330static struct ip_tunnel *ip_tunnel_create(struct net *net,
 331                                          struct ip_tunnel_net *itn,
 332                                          struct ip_tunnel_parm *parms)
 333{
 334        struct ip_tunnel *nt;
 335        struct net_device *dev;
 336        int t_hlen;
 337        int mtu;
 338        int err;
 339
 340        dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
 341        if (IS_ERR(dev))
 342                return ERR_CAST(dev);
 343
 344        mtu = ip_tunnel_bind_dev(dev);
 345        err = dev_set_mtu(dev, mtu);
 346        if (err)
 347                goto err_dev_set_mtu;
 348
 349        nt = netdev_priv(dev);
 350        t_hlen = nt->hlen + sizeof(struct iphdr);
 351        dev->min_mtu = ETH_MIN_MTU;
 352        dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
 353        ip_tunnel_add(itn, nt);
 354        return nt;
 355
 356err_dev_set_mtu:
 357        unregister_netdevice(dev);
 358        return ERR_PTR(err);
 359}
 360
 361int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 362                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 363                  bool log_ecn_error)
 364{
 365        struct pcpu_sw_netstats *tstats;
 366        const struct iphdr *iph = ip_hdr(skb);
 367        int err;
 368
 369#ifdef CONFIG_NET_IPGRE_BROADCAST
 370        if (ipv4_is_multicast(iph->daddr)) {
 371                tunnel->dev->stats.multicast++;
 372                skb->pkt_type = PACKET_BROADCAST;
 373        }
 374#endif
 375
 376        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
 377             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
 378                tunnel->dev->stats.rx_crc_errors++;
 379                tunnel->dev->stats.rx_errors++;
 380                goto drop;
 381        }
 382
 383        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
 384                if (!(tpi->flags&TUNNEL_SEQ) ||
 385                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
 386                        tunnel->dev->stats.rx_fifo_errors++;
 387                        tunnel->dev->stats.rx_errors++;
 388                        goto drop;
 389                }
 390                tunnel->i_seqno = ntohl(tpi->seq) + 1;
 391        }
 392
 393        skb_reset_network_header(skb);
 394
 395        err = IP_ECN_decapsulate(iph, skb);
 396        if (unlikely(err)) {
 397                if (log_ecn_error)
 398                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
 399                                        &iph->saddr, iph->tos);
 400                if (err > 1) {
 401                        ++tunnel->dev->stats.rx_frame_errors;
 402                        ++tunnel->dev->stats.rx_errors;
 403                        goto drop;
 404                }
 405        }
 406
 407        tstats = this_cpu_ptr(tunnel->dev->tstats);
 408        u64_stats_update_begin(&tstats->syncp);
 409        tstats->rx_packets++;
 410        tstats->rx_bytes += skb->len;
 411        u64_stats_update_end(&tstats->syncp);
 412
 413        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
 414
 415        if (tunnel->dev->type == ARPHRD_ETHER) {
 416                skb->protocol = eth_type_trans(skb, tunnel->dev);
 417                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 418        } else {
 419                skb->dev = tunnel->dev;
 420        }
 421
 422        if (tun_dst)
 423                skb_dst_set(skb, (struct dst_entry *)tun_dst);
 424
 425        gro_cells_receive(&tunnel->gro_cells, skb);
 426        return 0;
 427
 428drop:
 429        if (tun_dst)
 430                dst_release((struct dst_entry *)tun_dst);
 431        kfree_skb(skb);
 432        return 0;
 433}
 434EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 435
 436int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
 437                            unsigned int num)
 438{
 439        if (num >= MAX_IPTUN_ENCAP_OPS)
 440                return -ERANGE;
 441
 442        return !cmpxchg((const struct ip_tunnel_encap_ops **)
 443                        &iptun_encaps[num],
 444                        NULL, ops) ? 0 : -1;
 445}
 446EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
 447
 448int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
 449                            unsigned int num)
 450{
 451        int ret;
 452
 453        if (num >= MAX_IPTUN_ENCAP_OPS)
 454                return -ERANGE;
 455
 456        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
 457                       &iptun_encaps[num],
 458                       ops, NULL) == ops) ? 0 : -1;
 459
 460        synchronize_net();
 461
 462        return ret;
 463}
 464EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
 465
 466int ip_tunnel_encap_setup(struct ip_tunnel *t,
 467                          struct ip_tunnel_encap *ipencap)
 468{
 469        int hlen;
 470
 471        memset(&t->encap, 0, sizeof(t->encap));
 472
 473        hlen = ip_encap_hlen(ipencap);
 474        if (hlen < 0)
 475                return hlen;
 476
 477        t->encap.type = ipencap->type;
 478        t->encap.sport = ipencap->sport;
 479        t->encap.dport = ipencap->dport;
 480        t->encap.flags = ipencap->flags;
 481
 482        t->encap_hlen = hlen;
 483        t->hlen = t->encap_hlen + t->tun_hlen;
 484
 485        return 0;
 486}
 487EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 488
 489static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 490                            struct rtable *rt, __be16 df,
 491                            const struct iphdr *inner_iph,
 492                            int tunnel_hlen, __be32 dst, bool md)
 493{
 494        struct ip_tunnel *tunnel = netdev_priv(dev);
 495        int pkt_size;
 496        int mtu;
 497
 498        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
 499        pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;
 500
 501        if (df)
 502                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
 503                                        - sizeof(struct iphdr) - tunnel_hlen;
 504        else
 505                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 506
 507        if (skb_valid_dst(skb))
 508                skb_dst_update_pmtu(skb, mtu);
 509
 510        if (skb->protocol == htons(ETH_P_IP)) {
 511                if (!skb_is_gso(skb) &&
 512                    (inner_iph->frag_off & htons(IP_DF)) &&
 513                    mtu < pkt_size) {
 514                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 515                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 516                        return -E2BIG;
 517                }
 518        }
 519#if IS_ENABLED(CONFIG_IPV6)
 520        else if (skb->protocol == htons(ETH_P_IPV6)) {
 521                struct rt6_info *rt6;
 522                __be32 daddr;
 523
 524                rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
 525                                           NULL;
 526                daddr = md ? dst : tunnel->parms.iph.daddr;
 527
 528                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
 529                           mtu >= IPV6_MIN_MTU) {
 530                        if ((daddr && !ipv4_is_multicast(daddr)) ||
 531                            rt6->rt6i_dst.plen == 128) {
 532                                rt6->rt6i_flags |= RTF_MODIFIED;
 533                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 534                        }
 535                }
 536
 537                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
 538                                        mtu < pkt_size) {
 539                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 540                        return -E2BIG;
 541                }
 542        }
 543#endif
 544        return 0;
 545}
 546
 547void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 548                       u8 proto, int tunnel_hlen)
 549{
 550        struct ip_tunnel *tunnel = netdev_priv(dev);
 551        u32 headroom = sizeof(struct iphdr);
 552        struct ip_tunnel_info *tun_info;
 553        const struct ip_tunnel_key *key;
 554        const struct iphdr *inner_iph;
 555        struct rtable *rt = NULL;
 556        struct flowi4 fl4;
 557        __be16 df = 0;
 558        u8 tos, ttl;
 559        bool use_cache;
 560
 561        tun_info = skb_tunnel_info(skb);
 562        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
 563                     ip_tunnel_info_af(tun_info) != AF_INET))
 564                goto tx_error;
 565        key = &tun_info->key;
 566        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 567        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 568        tos = key->tos;
 569        if (tos == 1) {
 570                if (skb->protocol == htons(ETH_P_IP))
 571                        tos = inner_iph->tos;
 572                else if (skb->protocol == htons(ETH_P_IPV6))
 573                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
 574        }
 575        ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
 576                            tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
 577                            0, skb->mark, skb_get_hash(skb));
 578        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
 579                goto tx_error;
 580
 581        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
 582        if (use_cache)
 583                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
 584        if (!rt) {
 585                rt = ip_route_output_key(tunnel->net, &fl4);
 586                if (IS_ERR(rt)) {
 587                        dev->stats.tx_carrier_errors++;
 588                        goto tx_error;
 589                }
 590                if (use_cache)
 591                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
 592                                          fl4.saddr);
 593        }
 594        if (rt->dst.dev == dev) {
 595                ip_rt_put(rt);
 596                dev->stats.collisions++;
 597                goto tx_error;
 598        }
 599
 600        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
 601                df = htons(IP_DF);
 602        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
 603                            key->u.ipv4.dst, true)) {
 604                ip_rt_put(rt);
 605                goto tx_error;
 606        }
 607
 608        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
 609        ttl = key->ttl;
 610        if (ttl == 0) {
 611                if (skb->protocol == htons(ETH_P_IP))
 612                        ttl = inner_iph->ttl;
 613                else if (skb->protocol == htons(ETH_P_IPV6))
 614                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
 615                else
 616                        ttl = ip4_dst_hoplimit(&rt->dst);
 617        }
 618
 619        if (!df && skb->protocol == htons(ETH_P_IP))
 620                df = inner_iph->frag_off & htons(IP_DF);
 621
 622        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
 623        if (headroom > dev->needed_headroom)
 624                dev->needed_headroom = headroom;
 625
 626        if (skb_cow_head(skb, dev->needed_headroom)) {
 627                ip_rt_put(rt);
 628                goto tx_dropped;
 629        }
 630        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
 631                      df, !net_eq(tunnel->net, dev_net(dev)));
 632        return;
 633tx_error:
 634        dev->stats.tx_errors++;
 635        goto kfree;
 636tx_dropped:
 637        dev->stats.tx_dropped++;
 638kfree:
 639        kfree_skb(skb);
 640}
 641EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
 642
 643void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 644                    const struct iphdr *tnl_params, u8 protocol)
 645{
 646        struct ip_tunnel *tunnel = netdev_priv(dev);
 647        struct ip_tunnel_info *tun_info = NULL;
 648        const struct iphdr *inner_iph;
 649        unsigned int max_headroom;      /* The extra header space needed */
 650        struct rtable *rt = NULL;               /* Route to the other host */
 651        bool use_cache = false;
 652        struct flowi4 fl4;
 653        bool md = false;
 654        bool connected;
 655        u8 tos, ttl;
 656        __be32 dst;
 657        __be16 df;
 658
 659        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 660        connected = (tunnel->parms.iph.daddr != 0);
 661
 662        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 663
 664        dst = tnl_params->daddr;
 665        if (dst == 0) {
 666                /* NBMA tunnel */
 667
 668                if (!skb_dst(skb)) {
 669                        dev->stats.tx_fifo_errors++;
 670                        goto tx_error;
 671                }
 672
 673                tun_info = skb_tunnel_info(skb);
 674                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
 675                    ip_tunnel_info_af(tun_info) == AF_INET &&
 676                    tun_info->key.u.ipv4.dst) {
 677                        dst = tun_info->key.u.ipv4.dst;
 678                        md = true;
 679                        connected = true;
 680                }
 681                else if (skb->protocol == htons(ETH_P_IP)) {
 682                        rt = skb_rtable(skb);
 683                        dst = rt_nexthop(rt, inner_iph->daddr);
 684                }
 685#if IS_ENABLED(CONFIG_IPV6)
 686                else if (skb->protocol == htons(ETH_P_IPV6)) {
 687                        const struct in6_addr *addr6;
 688                        struct neighbour *neigh;
 689                        bool do_tx_error_icmp;
 690                        int addr_type;
 691
 692                        neigh = dst_neigh_lookup(skb_dst(skb),
 693                                                 &ipv6_hdr(skb)->daddr);
 694                        if (!neigh)
 695                                goto tx_error;
 696
 697                        addr6 = (const struct in6_addr *)&neigh->primary_key;
 698                        addr_type = ipv6_addr_type(addr6);
 699
 700                        if (addr_type == IPV6_ADDR_ANY) {
 701                                addr6 = &ipv6_hdr(skb)->daddr;
 702                                addr_type = ipv6_addr_type(addr6);
 703                        }
 704
 705                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 706                                do_tx_error_icmp = true;
 707                        else {
 708                                do_tx_error_icmp = false;
 709                                dst = addr6->s6_addr32[3];
 710                        }
 711                        neigh_release(neigh);
 712                        if (do_tx_error_icmp)
 713                                goto tx_error_icmp;
 714                }
 715#endif
 716                else
 717                        goto tx_error;
 718
 719                if (!md)
 720                        connected = false;
 721        }
 722
 723        tos = tnl_params->tos;
 724        if (tos & 0x1) {
 725                tos &= ~0x1;
 726                if (skb->protocol == htons(ETH_P_IP)) {
 727                        tos = inner_iph->tos;
 728                        connected = false;
 729                } else if (skb->protocol == htons(ETH_P_IPV6)) {
 730                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
 731                        connected = false;
 732                }
 733        }
 734
 735        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
 736                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
 737                            tunnel->fwmark, skb_get_hash(skb));
 738
 739        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
 740                goto tx_error;
 741
 742        if (connected && md) {
 743                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
 744                if (use_cache)
 745                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
 746                                               &fl4.saddr);
 747        } else {
 748                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
 749                                                &fl4.saddr) : NULL;
 750        }
 751
 752        if (!rt) {
 753                rt = ip_route_output_key(tunnel->net, &fl4);
 754
 755                if (IS_ERR(rt)) {
 756                        dev->stats.tx_carrier_errors++;
 757                        goto tx_error;
 758                }
 759                if (use_cache)
 760                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
 761                                          fl4.saddr);
 762                else if (!md && connected)
 763                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
 764                                          fl4.saddr);
 765        }
 766
 767        if (rt->dst.dev == dev) {
 768                ip_rt_put(rt);
 769                dev->stats.collisions++;
 770                goto tx_error;
 771        }
 772
 773        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
 774                            0, 0, false)) {
 775                ip_rt_put(rt);
 776                goto tx_error;
 777        }
 778
 779        if (tunnel->err_count > 0) {
 780                if (time_before(jiffies,
 781                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 782                        tunnel->err_count--;
 783
 784                        dst_link_failure(skb);
 785                } else
 786                        tunnel->err_count = 0;
 787        }
 788
 789        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
 790        ttl = tnl_params->ttl;
 791        if (ttl == 0) {
 792                if (skb->protocol == htons(ETH_P_IP))
 793                        ttl = inner_iph->ttl;
 794#if IS_ENABLED(CONFIG_IPV6)
 795                else if (skb->protocol == htons(ETH_P_IPV6))
 796                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
 797#endif
 798                else
 799                        ttl = ip4_dst_hoplimit(&rt->dst);
 800        }
 801
 802        df = tnl_params->frag_off;
 803        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
 804                df |= (inner_iph->frag_off&htons(IP_DF));
 805
 806        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
 807                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
 808        if (max_headroom > dev->needed_headroom)
 809                dev->needed_headroom = max_headroom;
 810
 811        if (skb_cow_head(skb, dev->needed_headroom)) {
 812                ip_rt_put(rt);
 813                dev->stats.tx_dropped++;
 814                kfree_skb(skb);
 815                return;
 816        }
 817
 818        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
 819                      df, !net_eq(tunnel->net, dev_net(dev)));
 820        return;
 821
 822#if IS_ENABLED(CONFIG_IPV6)
 823tx_error_icmp:
 824        dst_link_failure(skb);
 825#endif
 826tx_error:
 827        dev->stats.tx_errors++;
 828        kfree_skb(skb);
 829}
 830EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
 831
 832static void ip_tunnel_update(struct ip_tunnel_net *itn,
 833                             struct ip_tunnel *t,
 834                             struct net_device *dev,
 835                             struct ip_tunnel_parm *p,
 836                             bool set_mtu,
 837                             __u32 fwmark)
 838{
 839        ip_tunnel_del(itn, t);
 840        t->parms.iph.saddr = p->iph.saddr;
 841        t->parms.iph.daddr = p->iph.daddr;
 842        t->parms.i_key = p->i_key;
 843        t->parms.o_key = p->o_key;
 844        if (dev->type != ARPHRD_ETHER) {
 845                memcpy(dev->dev_addr, &p->iph.saddr, 4);
 846                memcpy(dev->broadcast, &p->iph.daddr, 4);
 847        }
 848        ip_tunnel_add(itn, t);
 849
 850        t->parms.iph.ttl = p->iph.ttl;
 851        t->parms.iph.tos = p->iph.tos;
 852        t->parms.iph.frag_off = p->iph.frag_off;
 853
 854        if (t->parms.link != p->link || t->fwmark != fwmark) {
 855                int mtu;
 856
 857                t->parms.link = p->link;
 858                t->fwmark = fwmark;
 859                mtu = ip_tunnel_bind_dev(dev);
 860                if (set_mtu)
 861                        dev->mtu = mtu;
 862        }
 863        dst_cache_reset(&t->dst_cache);
 864        netdev_state_change(dev);
 865}
 866
 867int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 868{
 869        int err = 0;
 870        struct ip_tunnel *t = netdev_priv(dev);
 871        struct net *net = t->net;
 872        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
 873
 874        switch (cmd) {
 875        case SIOCGETTUNNEL:
 876                if (dev == itn->fb_tunnel_dev) {
 877                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 878                        if (!t)
 879                                t = netdev_priv(dev);
 880                }
 881                memcpy(p, &t->parms, sizeof(*p));
 882                break;
 883
 884        case SIOCADDTUNNEL:
 885        case SIOCCHGTUNNEL:
 886                err = -EPERM;
 887                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 888                        goto done;
 889                if (p->iph.ttl)
 890                        p->iph.frag_off |= htons(IP_DF);
 891                if (!(p->i_flags & VTI_ISVTI)) {
 892                        if (!(p->i_flags & TUNNEL_KEY))
 893                                p->i_key = 0;
 894                        if (!(p->o_flags & TUNNEL_KEY))
 895                                p->o_key = 0;
 896                }
 897
 898                t = ip_tunnel_find(itn, p, itn->type);
 899
 900                if (cmd == SIOCADDTUNNEL) {
 901                        if (!t) {
 902                                t = ip_tunnel_create(net, itn, p);
 903                                err = PTR_ERR_OR_ZERO(t);
 904                                break;
 905                        }
 906
 907                        err = -EEXIST;
 908                        break;
 909                }
 910                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
 911                        if (t) {
 912                                if (t->dev != dev) {
 913                                        err = -EEXIST;
 914                                        break;
 915                                }
 916                        } else {
 917                                unsigned int nflags = 0;
 918
 919                                if (ipv4_is_multicast(p->iph.daddr))
 920                                        nflags = IFF_BROADCAST;
 921                                else if (p->iph.daddr)
 922                                        nflags = IFF_POINTOPOINT;
 923
 924                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
 925                                        err = -EINVAL;
 926                                        break;
 927                                }
 928
 929                                t = netdev_priv(dev);
 930                        }
 931                }
 932
 933                if (t) {
 934                        err = 0;
 935                        ip_tunnel_update(itn, t, dev, p, true, 0);
 936                } else {
 937                        err = -ENOENT;
 938                }
 939                break;
 940
 941        case SIOCDELTUNNEL:
 942                err = -EPERM;
 943                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 944                        goto done;
 945
 946                if (dev == itn->fb_tunnel_dev) {
 947                        err = -ENOENT;
 948                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 949                        if (!t)
 950                                goto done;
 951                        err = -EPERM;
 952                        if (t == netdev_priv(itn->fb_tunnel_dev))
 953                                goto done;
 954                        dev = t->dev;
 955                }
 956                unregister_netdevice(dev);
 957                err = 0;
 958                break;
 959
 960        default:
 961                err = -EINVAL;
 962        }
 963
 964done:
 965        return err;
 966}
 967EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 968
 969int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 970{
 971        struct ip_tunnel *tunnel = netdev_priv(dev);
 972        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 973        int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
 974
 975        if (new_mtu < ETH_MIN_MTU)
 976                return -EINVAL;
 977
 978        if (new_mtu > max_mtu) {
 979                if (strict)
 980                        return -EINVAL;
 981
 982                new_mtu = max_mtu;
 983        }
 984
 985        dev->mtu = new_mtu;
 986        return 0;
 987}
 988EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
 989
 990int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 991{
 992        return __ip_tunnel_change_mtu(dev, new_mtu, true);
 993}
 994EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
 995
 996static void ip_tunnel_dev_free(struct net_device *dev)
 997{
 998        struct ip_tunnel *tunnel = netdev_priv(dev);
 999
1000        gro_cells_destroy(&tunnel->gro_cells);
1001        dst_cache_destroy(&tunnel->dst_cache);
1002        free_percpu(dev->tstats);
1003}
1004
1005void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1006{
1007        struct ip_tunnel *tunnel = netdev_priv(dev);
1008        struct ip_tunnel_net *itn;
1009
1010        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1011
1012        if (itn->fb_tunnel_dev != dev) {
1013                ip_tunnel_del(itn, netdev_priv(dev));
1014                unregister_netdevice_queue(dev, head);
1015        }
1016}
1017EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1018
1019struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1020{
1021        struct ip_tunnel *tunnel = netdev_priv(dev);
1022
1023        return tunnel->net;
1024}
1025EXPORT_SYMBOL(ip_tunnel_get_link_net);
1026
1027int ip_tunnel_get_iflink(const struct net_device *dev)
1028{
1029        struct ip_tunnel *tunnel = netdev_priv(dev);
1030
1031        return tunnel->parms.link;
1032}
1033EXPORT_SYMBOL(ip_tunnel_get_iflink);
1034
1035int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1036                                  struct rtnl_link_ops *ops, char *devname)
1037{
1038        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1039        struct ip_tunnel_parm parms;
1040        unsigned int i;
1041
1042        itn->rtnl_link_ops = ops;
1043        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1044                INIT_HLIST_HEAD(&itn->tunnels[i]);
1045
1046        if (!ops || !net_has_fallback_tunnels(net)) {
1047                struct ip_tunnel_net *it_init_net;
1048
1049                it_init_net = net_generic(&init_net, ip_tnl_net_id);
1050                itn->type = it_init_net->type;
1051                itn->fb_tunnel_dev = NULL;
1052                return 0;
1053        }
1054
1055        memset(&parms, 0, sizeof(parms));
1056        if (devname)
1057                strlcpy(parms.name, devname, IFNAMSIZ);
1058
1059        rtnl_lock();
1060        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1061        /* FB netdevice is special: we have one, and only one per netns.
1062         * Allowing to move it to another netns is clearly unsafe.
1063         */
1064        if (!IS_ERR(itn->fb_tunnel_dev)) {
1065                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1066                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1067                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1068                itn->type = itn->fb_tunnel_dev->type;
1069        }
1070        rtnl_unlock();
1071
1072        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1073}
1074EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1075
1076static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1077                              struct list_head *head,
1078                              struct rtnl_link_ops *ops)
1079{
1080        struct net_device *dev, *aux;
1081        int h;
1082
1083        for_each_netdev_safe(net, dev, aux)
1084                if (dev->rtnl_link_ops == ops)
1085                        unregister_netdevice_queue(dev, head);
1086
1087        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1088                struct ip_tunnel *t;
1089                struct hlist_node *n;
1090                struct hlist_head *thead = &itn->tunnels[h];
1091
1092                hlist_for_each_entry_safe(t, n, thead, hash_node)
1093                        /* If dev is in the same netns, it has already
1094                         * been added to the list by the previous loop.
1095                         */
1096                        if (!net_eq(dev_net(t->dev), net))
1097                                unregister_netdevice_queue(t->dev, head);
1098        }
1099}
1100
1101void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1102                           struct rtnl_link_ops *ops)
1103{
1104        struct ip_tunnel_net *itn;
1105        struct net *net;
1106        LIST_HEAD(list);
1107
1108        rtnl_lock();
1109        list_for_each_entry(net, net_list, exit_list) {
1110                itn = net_generic(net, id);
1111                ip_tunnel_destroy(net, itn, &list, ops);
1112        }
1113        unregister_netdevice_many(&list);
1114        rtnl_unlock();
1115}
1116EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1117
1118int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1119                      struct ip_tunnel_parm *p, __u32 fwmark)
1120{
1121        struct ip_tunnel *nt;
1122        struct net *net = dev_net(dev);
1123        struct ip_tunnel_net *itn;
1124        int mtu;
1125        int err;
1126
1127        nt = netdev_priv(dev);
1128        itn = net_generic(net, nt->ip_tnl_net_id);
1129
1130        if (nt->collect_md) {
1131                if (rtnl_dereference(itn->collect_md_tun))
1132                        return -EEXIST;
1133        } else {
1134                if (ip_tunnel_find(itn, p, dev->type))
1135                        return -EEXIST;
1136        }
1137
1138        nt->net = net;
1139        nt->parms = *p;
1140        nt->fwmark = fwmark;
1141        err = register_netdevice(dev);
1142        if (err)
1143                goto err_register_netdevice;
1144
1145        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1146                eth_hw_addr_random(dev);
1147
1148        mtu = ip_tunnel_bind_dev(dev);
1149        if (tb[IFLA_MTU]) {
1150                unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1151
1152                mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1153                            (unsigned int)(max - sizeof(struct iphdr)));
1154        }
1155
1156        err = dev_set_mtu(dev, mtu);
1157        if (err)
1158                goto err_dev_set_mtu;
1159
1160        ip_tunnel_add(itn, nt);
1161        return 0;
1162
1163err_dev_set_mtu:
1164        unregister_netdevice(dev);
1165err_register_netdevice:
1166        return err;
1167}
1168EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1169
1170int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1171                         struct ip_tunnel_parm *p, __u32 fwmark)
1172{
1173        struct ip_tunnel *t;
1174        struct ip_tunnel *tunnel = netdev_priv(dev);
1175        struct net *net = tunnel->net;
1176        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1177
1178        if (dev == itn->fb_tunnel_dev)
1179                return -EINVAL;
1180
1181        t = ip_tunnel_find(itn, p, dev->type);
1182
1183        if (t) {
1184                if (t->dev != dev)
1185                        return -EEXIST;
1186        } else {
1187                t = tunnel;
1188
1189                if (dev->type != ARPHRD_ETHER) {
1190                        unsigned int nflags = 0;
1191
1192                        if (ipv4_is_multicast(p->iph.daddr))
1193                                nflags = IFF_BROADCAST;
1194                        else if (p->iph.daddr)
1195                                nflags = IFF_POINTOPOINT;
1196
1197                        if ((dev->flags ^ nflags) &
1198                            (IFF_POINTOPOINT | IFF_BROADCAST))
1199                                return -EINVAL;
1200                }
1201        }
1202
1203        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1204        return 0;
1205}
1206EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1207
1208int ip_tunnel_init(struct net_device *dev)
1209{
1210        struct ip_tunnel *tunnel = netdev_priv(dev);
1211        struct iphdr *iph = &tunnel->parms.iph;
1212        int err;
1213
1214        dev->needs_free_netdev = true;
1215        dev->priv_destructor = ip_tunnel_dev_free;
1216        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1217        if (!dev->tstats)
1218                return -ENOMEM;
1219
1220        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1221        if (err) {
1222                free_percpu(dev->tstats);
1223                return err;
1224        }
1225
1226        err = gro_cells_init(&tunnel->gro_cells, dev);
1227        if (err) {
1228                dst_cache_destroy(&tunnel->dst_cache);
1229                free_percpu(dev->tstats);
1230                return err;
1231        }
1232
1233        tunnel->dev = dev;
1234        tunnel->net = dev_net(dev);
1235        strcpy(tunnel->parms.name, dev->name);
1236        iph->version            = 4;
1237        iph->ihl                = 5;
1238
1239        if (tunnel->collect_md) {
1240                dev->features |= NETIF_F_NETNS_LOCAL;
1241                netif_keep_dst(dev);
1242        }
1243        return 0;
1244}
1245EXPORT_SYMBOL_GPL(ip_tunnel_init);
1246
1247void ip_tunnel_uninit(struct net_device *dev)
1248{
1249        struct ip_tunnel *tunnel = netdev_priv(dev);
1250        struct net *net = tunnel->net;
1251        struct ip_tunnel_net *itn;
1252
1253        itn = net_generic(net, tunnel->ip_tnl_net_id);
1254        /* fb_tunnel_dev will be unregisted in net-exit call. */
1255        if (itn->fb_tunnel_dev != dev)
1256                ip_tunnel_del(itn, netdev_priv(dev));
1257
1258        dst_cache_reset(&tunnel->dst_cache);
1259}
1260EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1261
1262/* Do least required initialization, rest of init is done in tunnel_init call */
1263void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1264{
1265        struct ip_tunnel *tunnel = netdev_priv(dev);
1266        tunnel->ip_tnl_net_id = net_id;
1267}
1268EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1269
1270MODULE_LICENSE("GPL");
1271