linux/net/ipv4/ip_tunnel_core.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2013 Nicira, Inc.
   4 */
   5
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/types.h>
   9#include <linux/kernel.h>
  10#include <linux/skbuff.h>
  11#include <linux/netdevice.h>
  12#include <linux/in.h>
  13#include <linux/if_arp.h>
  14#include <linux/init.h>
  15#include <linux/in6.h>
  16#include <linux/inetdevice.h>
  17#include <linux/netfilter_ipv4.h>
  18#include <linux/etherdevice.h>
  19#include <linux/if_ether.h>
  20#include <linux/if_vlan.h>
  21#include <linux/static_key.h>
  22
  23#include <net/ip.h>
  24#include <net/icmp.h>
  25#include <net/protocol.h>
  26#include <net/ip_tunnels.h>
  27#include <net/ip6_tunnel.h>
  28#include <net/arp.h>
  29#include <net/checksum.h>
  30#include <net/dsfield.h>
  31#include <net/inet_ecn.h>
  32#include <net/xfrm.h>
  33#include <net/net_namespace.h>
  34#include <net/netns/generic.h>
  35#include <net/rtnetlink.h>
  36#include <net/dst_metadata.h>
  37
  38const struct ip_tunnel_encap_ops __rcu *
  39                iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
  40EXPORT_SYMBOL(iptun_encaps);
  41
  42const struct ip6_tnl_encap_ops __rcu *
  43                ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
  44EXPORT_SYMBOL(ip6tun_encaps);
  45
  46void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
  47                   __be32 src, __be32 dst, __u8 proto,
  48                   __u8 tos, __u8 ttl, __be16 df, bool xnet)
  49{
  50        int pkt_len = skb->len - skb_inner_network_offset(skb);
  51        struct net *net = dev_net(rt->dst.dev);
  52        struct net_device *dev = skb->dev;
  53        struct iphdr *iph;
  54        int err;
  55
  56        skb_scrub_packet(skb, xnet);
  57
  58        skb_clear_hash_if_not_l4(skb);
  59        skb_dst_set(skb, &rt->dst);
  60        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
  61
  62        /* Push down and install the IP header. */
  63        skb_push(skb, sizeof(struct iphdr));
  64        skb_reset_network_header(skb);
  65
  66        iph = ip_hdr(skb);
  67
  68        iph->version    =       4;
  69        iph->ihl        =       sizeof(struct iphdr) >> 2;
  70        iph->frag_off   =       ip_mtu_locked(&rt->dst) ? 0 : df;
  71        iph->protocol   =       proto;
  72        iph->tos        =       tos;
  73        iph->daddr      =       dst;
  74        iph->saddr      =       src;
  75        iph->ttl        =       ttl;
  76        __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
  77
  78        err = ip_local_out(net, sk, skb);
  79
  80        if (dev) {
  81                if (unlikely(net_xmit_eval(err)))
  82                        pkt_len = 0;
  83                iptunnel_xmit_stats(dev, pkt_len);
  84        }
  85}
  86EXPORT_SYMBOL_GPL(iptunnel_xmit);
  87
  88int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
  89                           __be16 inner_proto, bool raw_proto, bool xnet)
  90{
  91        if (unlikely(!pskb_may_pull(skb, hdr_len)))
  92                return -ENOMEM;
  93
  94        skb_pull_rcsum(skb, hdr_len);
  95
  96        if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
  97                struct ethhdr *eh;
  98
  99                if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
 100                        return -ENOMEM;
 101
 102                eh = (struct ethhdr *)skb->data;
 103                if (likely(eth_proto_is_802_3(eh->h_proto)))
 104                        skb->protocol = eh->h_proto;
 105                else
 106                        skb->protocol = htons(ETH_P_802_2);
 107
 108        } else {
 109                skb->protocol = inner_proto;
 110        }
 111
 112        skb_clear_hash_if_not_l4(skb);
 113        __vlan_hwaccel_clear_tag(skb);
 114        skb_set_queue_mapping(skb, 0);
 115        skb_scrub_packet(skb, xnet);
 116
 117        return iptunnel_pull_offloads(skb);
 118}
 119EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
 120
 121struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 122                                             gfp_t flags)
 123{
 124        struct metadata_dst *res;
 125        struct ip_tunnel_info *dst, *src;
 126
 127        if (!md || md->type != METADATA_IP_TUNNEL ||
 128            md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
 129
 130                return NULL;
 131
 132        res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags);
 133        if (!res)
 134                return NULL;
 135
 136        dst = &res->u.tun_info;
 137        src = &md->u.tun_info;
 138        dst->key.tun_id = src->key.tun_id;
 139        if (src->mode & IP_TUNNEL_INFO_IPV6)
 140                memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
 141                       sizeof(struct in6_addr));
 142        else
 143                dst->key.u.ipv4.dst = src->key.u.ipv4.src;
 144        dst->key.tun_flags = src->key.tun_flags;
 145        dst->mode = src->mode | IP_TUNNEL_INFO_TX;
 146
 147        return res;
 148}
 149EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
 150
 151int iptunnel_handle_offloads(struct sk_buff *skb,
 152                             int gso_type_mask)
 153{
 154        int err;
 155
 156        if (likely(!skb->encapsulation)) {
 157                skb_reset_inner_headers(skb);
 158                skb->encapsulation = 1;
 159        }
 160
 161        if (skb_is_gso(skb)) {
 162                err = skb_header_unclone(skb, GFP_ATOMIC);
 163                if (unlikely(err))
 164                        return err;
 165                skb_shinfo(skb)->gso_type |= gso_type_mask;
 166                return 0;
 167        }
 168
 169        if (skb->ip_summed != CHECKSUM_PARTIAL) {
 170                skb->ip_summed = CHECKSUM_NONE;
 171                /* We clear encapsulation here to prevent badly-written
 172                 * drivers potentially deciding to offload an inner checksum
 173                 * if we set CHECKSUM_PARTIAL on the outer header.
 174                 * This should go away when the drivers are all fixed.
 175                 */
 176                skb->encapsulation = 0;
 177        }
 178
 179        return 0;
 180}
 181EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
 182
 183/* Often modified stats are per cpu, other are shared (netdev->stats) */
 184void ip_tunnel_get_stats64(struct net_device *dev,
 185                           struct rtnl_link_stats64 *tot)
 186{
 187        int i;
 188
 189        netdev_stats_to_stats64(tot, &dev->stats);
 190
 191        for_each_possible_cpu(i) {
 192                const struct pcpu_sw_netstats *tstats =
 193                                                   per_cpu_ptr(dev->tstats, i);
 194                u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
 195                unsigned int start;
 196
 197                do {
 198                        start = u64_stats_fetch_begin_irq(&tstats->syncp);
 199                        rx_packets = tstats->rx_packets;
 200                        tx_packets = tstats->tx_packets;
 201                        rx_bytes = tstats->rx_bytes;
 202                        tx_bytes = tstats->tx_bytes;
 203                } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
 204
 205                tot->rx_packets += rx_packets;
 206                tot->tx_packets += tx_packets;
 207                tot->rx_bytes   += rx_bytes;
 208                tot->tx_bytes   += tx_bytes;
 209        }
 210}
 211EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
 212
 213static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 214        [LWTUNNEL_IP_ID]        = { .type = NLA_U64 },
 215        [LWTUNNEL_IP_DST]       = { .type = NLA_U32 },
 216        [LWTUNNEL_IP_SRC]       = { .type = NLA_U32 },
 217        [LWTUNNEL_IP_TTL]       = { .type = NLA_U8 },
 218        [LWTUNNEL_IP_TOS]       = { .type = NLA_U8 },
 219        [LWTUNNEL_IP_FLAGS]     = { .type = NLA_U16 },
 220};
 221
 222static int ip_tun_build_state(struct nlattr *attr,
 223                              unsigned int family, const void *cfg,
 224                              struct lwtunnel_state **ts,
 225                              struct netlink_ext_ack *extack)
 226{
 227        struct ip_tunnel_info *tun_info;
 228        struct lwtunnel_state *new_state;
 229        struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
 230        int err;
 231
 232        err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
 233                                          ip_tun_policy, extack);
 234        if (err < 0)
 235                return err;
 236
 237        new_state = lwtunnel_state_alloc(sizeof(*tun_info));
 238        if (!new_state)
 239                return -ENOMEM;
 240
 241        new_state->type = LWTUNNEL_ENCAP_IP;
 242
 243        tun_info = lwt_tun_info(new_state);
 244
 245#ifdef CONFIG_DST_CACHE
 246        err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
 247        if (err) {
 248                lwtstate_free(new_state);
 249                return err;
 250        }
 251#endif
 252
 253        if (tb[LWTUNNEL_IP_ID])
 254                tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
 255
 256        if (tb[LWTUNNEL_IP_DST])
 257                tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);
 258
 259        if (tb[LWTUNNEL_IP_SRC])
 260                tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);
 261
 262        if (tb[LWTUNNEL_IP_TTL])
 263                tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
 264
 265        if (tb[LWTUNNEL_IP_TOS])
 266                tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
 267
 268        if (tb[LWTUNNEL_IP_FLAGS])
 269                tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]);
 270
 271        tun_info->mode = IP_TUNNEL_INFO_TX;
 272        tun_info->options_len = 0;
 273
 274        *ts = new_state;
 275
 276        return 0;
 277}
 278
 279static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
 280{
 281#ifdef CONFIG_DST_CACHE
 282        struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
 283
 284        dst_cache_destroy(&tun_info->dst_cache);
 285#endif
 286}
 287
 288static int ip_tun_fill_encap_info(struct sk_buff *skb,
 289                                  struct lwtunnel_state *lwtstate)
 290{
 291        struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
 292
 293        if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id,
 294                         LWTUNNEL_IP_PAD) ||
 295            nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
 296            nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
 297            nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
 298            nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
 299            nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
 300                return -ENOMEM;
 301
 302        return 0;
 303}
 304
 305static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
 306{
 307        return nla_total_size_64bit(8)  /* LWTUNNEL_IP_ID */
 308                + nla_total_size(4)     /* LWTUNNEL_IP_DST */
 309                + nla_total_size(4)     /* LWTUNNEL_IP_SRC */
 310                + nla_total_size(1)     /* LWTUNNEL_IP_TOS */
 311                + nla_total_size(1)     /* LWTUNNEL_IP_TTL */
 312                + nla_total_size(2);    /* LWTUNNEL_IP_FLAGS */
 313}
 314
 315static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
 316{
 317        return memcmp(lwt_tun_info(a), lwt_tun_info(b),
 318                      sizeof(struct ip_tunnel_info));
 319}
 320
 321static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
 322        .build_state = ip_tun_build_state,
 323        .destroy_state = ip_tun_destroy_state,
 324        .fill_encap = ip_tun_fill_encap_info,
 325        .get_encap_size = ip_tun_encap_nlsize,
 326        .cmp_encap = ip_tun_cmp_encap,
 327        .owner = THIS_MODULE,
 328};
 329
 330static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
 331        [LWTUNNEL_IP6_ID]               = { .type = NLA_U64 },
 332        [LWTUNNEL_IP6_DST]              = { .len = sizeof(struct in6_addr) },
 333        [LWTUNNEL_IP6_SRC]              = { .len = sizeof(struct in6_addr) },
 334        [LWTUNNEL_IP6_HOPLIMIT]         = { .type = NLA_U8 },
 335        [LWTUNNEL_IP6_TC]               = { .type = NLA_U8 },
 336        [LWTUNNEL_IP6_FLAGS]            = { .type = NLA_U16 },
 337};
 338
 339static int ip6_tun_build_state(struct nlattr *attr,
 340                               unsigned int family, const void *cfg,
 341                               struct lwtunnel_state **ts,
 342                               struct netlink_ext_ack *extack)
 343{
 344        struct ip_tunnel_info *tun_info;
 345        struct lwtunnel_state *new_state;
 346        struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
 347        int err;
 348
 349        err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
 350                                          ip6_tun_policy, extack);
 351        if (err < 0)
 352                return err;
 353
 354        new_state = lwtunnel_state_alloc(sizeof(*tun_info));
 355        if (!new_state)
 356                return -ENOMEM;
 357
 358        new_state->type = LWTUNNEL_ENCAP_IP6;
 359
 360        tun_info = lwt_tun_info(new_state);
 361
 362        if (tb[LWTUNNEL_IP6_ID])
 363                tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
 364
 365        if (tb[LWTUNNEL_IP6_DST])
 366                tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
 367
 368        if (tb[LWTUNNEL_IP6_SRC])
 369                tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]);
 370
 371        if (tb[LWTUNNEL_IP6_HOPLIMIT])
 372                tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]);
 373
 374        if (tb[LWTUNNEL_IP6_TC])
 375                tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
 376
 377        if (tb[LWTUNNEL_IP6_FLAGS])
 378                tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
 379
 380        tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
 381        tun_info->options_len = 0;
 382
 383        *ts = new_state;
 384
 385        return 0;
 386}
 387
 388static int ip6_tun_fill_encap_info(struct sk_buff *skb,
 389                                   struct lwtunnel_state *lwtstate)
 390{
 391        struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
 392
 393        if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id,
 394                         LWTUNNEL_IP6_PAD) ||
 395            nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
 396            nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
 397            nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
 398            nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
 399            nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
 400                return -ENOMEM;
 401
 402        return 0;
 403}
 404
 405static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
 406{
 407        return nla_total_size_64bit(8)  /* LWTUNNEL_IP6_ID */
 408                + nla_total_size(16)    /* LWTUNNEL_IP6_DST */
 409                + nla_total_size(16)    /* LWTUNNEL_IP6_SRC */
 410                + nla_total_size(1)     /* LWTUNNEL_IP6_HOPLIMIT */
 411                + nla_total_size(1)     /* LWTUNNEL_IP6_TC */
 412                + nla_total_size(2);    /* LWTUNNEL_IP6_FLAGS */
 413}
 414
 415static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
 416        .build_state = ip6_tun_build_state,
 417        .fill_encap = ip6_tun_fill_encap_info,
 418        .get_encap_size = ip6_tun_encap_nlsize,
 419        .cmp_encap = ip_tun_cmp_encap,
 420        .owner = THIS_MODULE,
 421};
 422
 423void __init ip_tunnel_core_init(void)
 424{
 425        /* If you land here, make sure whether increasing ip_tunnel_info's
 426         * options_len is a reasonable choice with its usage in front ends
 427         * (f.e., it's part of flow keys, etc).
 428         */
 429        BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);
 430
 431        lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
 432        lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
 433}
 434
 435DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
 436EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
 437
 438void ip_tunnel_need_metadata(void)
 439{
 440        static_branch_inc(&ip_tunnel_metadata_cnt);
 441}
 442EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
 443
 444void ip_tunnel_unneed_metadata(void)
 445{
 446        static_branch_dec(&ip_tunnel_metadata_cnt);
 447}
 448EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
 449