linux/net/netfilter/ipvs/ip_vs_xmit.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * ip_vs_xmit.c: various packet transmitters for IPVS
   4 *
   5 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   6 *              Julian Anastasov <ja@ssi.bg>
   7 *
   8 * Changes:
   9 *
  10 * Description of forwarding methods:
  11 * - all transmitters are called from LOCAL_IN (remote clients) and
  12 * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
  13 * - not all connections have destination server, for example,
  14 * connections in backup server when fwmark is used
  15 * - bypass connections use daddr from packet
  16 * - we can use dst without ref while sending in RCU section, we use
  17 * ref when returning NF_ACCEPT for NAT-ed packet via loopback
  18 * LOCAL_OUT rules:
  19 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
  20 * - skb->pkt_type is not set yet
  21 * - the only place where we can see skb->sk != NULL
  22 */
  23
  24#define KMSG_COMPONENT "IPVS"
  25#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  26
  27#include <linux/kernel.h>
  28#include <linux/slab.h>
  29#include <linux/tcp.h>                  /* for tcphdr */
  30#include <net/ip.h>
  31#include <net/gue.h>
  32#include <net/gre.h>
  33#include <net/tcp.h>                    /* for csum_tcpudp_magic */
  34#include <net/udp.h>
  35#include <net/icmp.h>                   /* for icmp_send */
  36#include <net/route.h>                  /* for ip_route_output */
  37#include <net/ipv6.h>
  38#include <net/ip6_route.h>
  39#include <net/ip_tunnels.h>
  40#include <net/ip6_checksum.h>
  41#include <net/addrconf.h>
  42#include <linux/icmpv6.h>
  43#include <linux/netfilter.h>
  44#include <linux/netfilter_ipv4.h>
  45
  46#include <net/ip_vs.h>
  47
  48enum {
  49        IP_VS_RT_MODE_LOCAL     = 1, /* Allow local dest */
  50        IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */
  51        IP_VS_RT_MODE_RDR       = 4, /* Allow redirect from remote daddr to
  52                                      * local
  53                                      */
  54        IP_VS_RT_MODE_CONNECT   = 8, /* Always bind route to saddr */
  55        IP_VS_RT_MODE_KNOWN_NH  = 16,/* Route via remote addr */
  56        IP_VS_RT_MODE_TUNNEL    = 32,/* Tunnel mode */
  57};
  58
  59static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
  60{
  61        return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
  62}
  63
  64static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
  65{
  66        kfree(dest_dst);
  67}
  68
  69/*
  70 *      Destination cache to speed up outgoing route lookup
  71 */
  72static inline void
  73__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
  74                struct dst_entry *dst, u32 dst_cookie)
  75{
  76        struct ip_vs_dest_dst *old;
  77
  78        old = rcu_dereference_protected(dest->dest_dst,
  79                                        lockdep_is_held(&dest->dst_lock));
  80
  81        if (dest_dst) {
  82                dest_dst->dst_cache = dst;
  83                dest_dst->dst_cookie = dst_cookie;
  84        }
  85        rcu_assign_pointer(dest->dest_dst, dest_dst);
  86
  87        if (old)
  88                call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
  89}
  90
  91static inline struct ip_vs_dest_dst *
  92__ip_vs_dst_check(struct ip_vs_dest *dest)
  93{
  94        struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
  95        struct dst_entry *dst;
  96
  97        if (!dest_dst)
  98                return NULL;
  99        dst = dest_dst->dst_cache;
 100        if (dst->obsolete &&
 101            dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
 102                return NULL;
 103        return dest_dst;
 104}
 105
 106static inline bool
 107__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
 108{
 109        if (IP6CB(skb)->frag_max_size) {
 110                /* frag_max_size tell us that, this packet have been
 111                 * defragmented by netfilter IPv6 conntrack module.
 112                 */
 113                if (IP6CB(skb)->frag_max_size > mtu)
 114                        return true; /* largest fragment violate MTU */
 115        }
 116        else if (skb->len > mtu && !skb_is_gso(skb)) {
 117                return true; /* Packet size violate MTU size */
 118        }
 119        return false;
 120}
 121
 122/* Get route to daddr, update *saddr, optionally bind route to saddr */
 123static struct rtable *do_output_route4(struct net *net, __be32 daddr,
 124                                       int rt_mode, __be32 *saddr)
 125{
 126        struct flowi4 fl4;
 127        struct rtable *rt;
 128        bool loop = false;
 129
 130        memset(&fl4, 0, sizeof(fl4));
 131        fl4.daddr = daddr;
 132        fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
 133                           FLOWI_FLAG_KNOWN_NH : 0;
 134
 135retry:
 136        rt = ip_route_output_key(net, &fl4);
 137        if (IS_ERR(rt)) {
 138                /* Invalid saddr ? */
 139                if (PTR_ERR(rt) == -EINVAL && *saddr &&
 140                    rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
 141                        *saddr = 0;
 142                        flowi4_update_output(&fl4, 0, 0, daddr, 0);
 143                        goto retry;
 144                }
 145                IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
 146                return NULL;
 147        } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
 148                ip_rt_put(rt);
 149                *saddr = fl4.saddr;
 150                flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
 151                loop = true;
 152                goto retry;
 153        }
 154        *saddr = fl4.saddr;
 155        return rt;
 156}
 157
 158#ifdef CONFIG_IP_VS_IPV6
 159static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
 160{
 161        return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
 162}
 163#endif
 164
 165static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
 166                                                int rt_mode,
 167                                                bool new_rt_is_local)
 168{
 169        bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
 170        bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
 171        bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
 172        bool source_is_loopback;
 173        bool old_rt_is_local;
 174
 175#ifdef CONFIG_IP_VS_IPV6
 176        if (skb_af == AF_INET6) {
 177                int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
 178
 179                source_is_loopback =
 180                        (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
 181                        (addr_type & IPV6_ADDR_LOOPBACK);
 182                old_rt_is_local = __ip_vs_is_local_route6(
 183                        (struct rt6_info *)skb_dst(skb));
 184        } else
 185#endif
 186        {
 187                source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
 188                old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
 189        }
 190
 191        if (unlikely(new_rt_is_local)) {
 192                if (!rt_mode_allow_local)
 193                        return true;
 194                if (!rt_mode_allow_redirect && !old_rt_is_local)
 195                        return true;
 196        } else {
 197                if (!rt_mode_allow_non_local)
 198                        return true;
 199                if (source_is_loopback)
 200                        return true;
 201        }
 202        return false;
 203}
 204
 205static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
 206{
 207        struct sock *sk = skb->sk;
 208        struct rtable *ort = skb_rtable(skb);
 209
 210        if (!skb->dev && sk && sk_fullsock(sk))
 211                ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
 212}
 213
 214static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
 215                                          int rt_mode,
 216                                          struct ip_vs_iphdr *ipvsh,
 217                                          struct sk_buff *skb, int mtu)
 218{
 219#ifdef CONFIG_IP_VS_IPV6
 220        if (skb_af == AF_INET6) {
 221                struct net *net = ipvs->net;
 222
 223                if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
 224                        if (!skb->dev)
 225                                skb->dev = net->loopback_dev;
 226                        /* only send ICMP too big on first fragment */
 227                        if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
 228                                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 229                        IP_VS_DBG(1, "frag needed for %pI6c\n",
 230                                  &ipv6_hdr(skb)->saddr);
 231                        return false;
 232                }
 233        } else
 234#endif
 235        {
 236                /* If we're going to tunnel the packet and pmtu discovery
 237                 * is disabled, we'll just fragment it anyway
 238                 */
 239                if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
 240                        return true;
 241
 242                if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
 243                             skb->len > mtu && !skb_is_gso(skb) &&
 244                             !ip_vs_iph_icmp(ipvsh))) {
 245                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 246                                  htonl(mtu));
 247                        IP_VS_DBG(1, "frag needed for %pI4\n",
 248                                  &ip_hdr(skb)->saddr);
 249                        return false;
 250                }
 251        }
 252
 253        return true;
 254}
 255
 256static inline bool decrement_ttl(struct netns_ipvs *ipvs,
 257                                 int skb_af,
 258                                 struct sk_buff *skb)
 259{
 260        struct net *net = ipvs->net;
 261
 262#ifdef CONFIG_IP_VS_IPV6
 263        if (skb_af == AF_INET6) {
 264                struct dst_entry *dst = skb_dst(skb);
 265
 266                /* check and decrement ttl */
 267                if (ipv6_hdr(skb)->hop_limit <= 1) {
 268                        struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
 269
 270                        /* Force OUTPUT device used as source address */
 271                        skb->dev = dst->dev;
 272                        icmpv6_send(skb, ICMPV6_TIME_EXCEED,
 273                                    ICMPV6_EXC_HOPLIMIT, 0);
 274                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 275
 276                        return false;
 277                }
 278
 279                /* don't propagate ttl change to cloned packets */
 280                if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
 281                        return false;
 282
 283                ipv6_hdr(skb)->hop_limit--;
 284        } else
 285#endif
 286        {
 287                if (ip_hdr(skb)->ttl <= 1) {
 288                        /* Tell the sender its packet died... */
 289                        __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
 290                        icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
 291                        return false;
 292                }
 293
 294                /* don't propagate ttl change to cloned packets */
 295                if (skb_ensure_writable(skb, sizeof(struct iphdr)))
 296                        return false;
 297
 298                /* Decrease ttl */
 299                ip_decrease_ttl(ip_hdr(skb));
 300        }
 301
 302        return true;
 303}
 304
  305/* Get route to destination or remote server
      *
      * With @dest, the route cached in dest->dest_dst is used; when it is
      * missing or no longer valid a new one is looked up and cached under
      * dest->dst_lock.  Such a route is attached to the skb without taking a
      * reference (noref).  Without @dest, a route to @daddr is looked up and
      * the reference returned by the lookup is handed over to the skb.
      *
      * @ret_saddr, if not NULL, receives the source address chosen for the
      * route.
      *
      * Returns 1 if the route is local (the skb keeps its old route), 0 if
      * the new route was attached to the skb, -1 on error.  The skb is not
      * freed here on error.
      */
  306static int
  307__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
  308                   struct ip_vs_dest *dest,
  309                   __be32 daddr, int rt_mode, __be32 *ret_saddr,
  310                   struct ip_vs_iphdr *ipvsh)
  311{
  312        struct net *net = ipvs->net;
  313        struct ip_vs_dest_dst *dest_dst;
  314        struct rtable *rt;                      /* Route to the other host */
  315        int mtu;
  316        int local, noref = 1;
  317
  318        if (dest) {
  319                dest_dst = __ip_vs_dst_check(dest);
  320                if (likely(dest_dst))
  321                        rt = (struct rtable *) dest_dst->dst_cache;
  322                else {
                             /* Allocate outside the lock; the result is
                              * only checked once the lock is held so that
                              * the stale cache entry can be cleared on
                              * failure.
                              */
  323                        dest_dst = ip_vs_dest_dst_alloc();
  324                        spin_lock_bh(&dest->dst_lock);
  325                        if (!dest_dst) {
  326                                __ip_vs_dst_set(dest, NULL, NULL, 0);
  327                                spin_unlock_bh(&dest->dst_lock);
  328                                goto err_unreach;
  329                        }
  330                        rt = do_output_route4(net, dest->addr.ip, rt_mode,
  331                                              &dest_dst->dst_saddr.ip);
  332                        if (!rt) {
  333                                __ip_vs_dst_set(dest, NULL, NULL, 0);
  334                                spin_unlock_bh(&dest->dst_lock);
  335                                ip_vs_dest_dst_free(dest_dst);
  336                                goto err_unreach;
  337                        }
                             /* The cache entry now owns the route */
  338                        __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
  339                        spin_unlock_bh(&dest->dst_lock);
  340                        IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
  341                                  &dest->addr.ip, &dest_dst->dst_saddr.ip,
  342                                  atomic_read(&rt->dst.__refcnt));
  343                }
  344                if (ret_saddr)
  345                        *ret_saddr = dest_dst->dst_saddr.ip;
  346        } else {
  347                __be32 saddr = htonl(INADDR_ANY);
  348
                     /* No cache to borrow from: the skb will own a reference */
  349                noref = 0;
  350
  351                /* For such unconfigured boxes avoid many route lookups
  352                 * for performance reasons because we do not remember saddr
  353                 */
  354                rt_mode &= ~IP_VS_RT_MODE_CONNECT;
  355                rt = do_output_route4(net, daddr, rt_mode, &saddr);
  356                if (!rt)
  357                        goto err_unreach;
  358                if (ret_saddr)
  359                        *ret_saddr = saddr;
  360        }
  361
  362        local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
  363        if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
  364                                                  local))) {
  365                IP_VS_DBG_RL("We are crossing local and non-local addresses"
  366                             " daddr=%pI4\n", &daddr);
  367                goto err_put;
  368        }
  369
  370        if (unlikely(local)) {
  371                /* skb to local stack, preserve old route */
  372                if (!noref)
  373                        ip_rt_put(rt);
  374                return local;
  375        }
  376
  377        if (!decrement_ttl(ipvs, skb_af, skb))
  378                goto err_put;
  379
  380        if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
  381                mtu = dst_mtu(&rt->dst);
  382        } else {
                     /* Tunnel mode: subtract the encapsulation overhead,
                      * starting with the outer IPv4 header.
                      */
  383                mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
                     /* The tunnel parameters are read from dest below */
  384                if (!dest)
  385                        goto err_put;
  386                if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
  387                        mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
  388                        if ((dest->tun_flags &
  389                             IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
  390                            skb->ip_summed == CHECKSUM_PARTIAL)
  391                                mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
  392                } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
  393                        __be16 tflags = 0;
  394
  395                        if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
  396                                tflags |= TUNNEL_CSUM;
  397                        mtu -= gre_calc_hlen(tflags);
  398                }
                     /* NOTE(review): 68 is presumably the minimum IPv4 MTU */
  399                if (mtu < 68) {
  400                        IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
  401                        goto err_put;
  402                }
  403                maybe_update_pmtu(skb_af, skb, mtu);
  404        }
  405
  406        if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
  407                goto err_put;
  408
             /* Replace the route of the skb with the new one */
  409        skb_dst_drop(skb);
  410        if (noref)
  411                skb_dst_set_noref(skb, &rt->dst);
  412        else
  413                skb_dst_set(skb, &rt->dst);
  414
  415        return local;
  416
  417err_put:
             /* A cached (noref) route has no reference of ours to drop */
  418        if (!noref)
  419                ip_rt_put(rt);
  420        return -1;
  421
  422err_unreach:
  423        dst_link_failure(skb);
  424        return -1;
  425}
 426
 427#ifdef CONFIG_IP_VS_IPV6
 428static struct dst_entry *
 429__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
 430                        struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
 431{
 432        struct dst_entry *dst;
 433        struct flowi6 fl6 = {
 434                .daddr = *daddr,
 435        };
 436
 437        if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
 438                fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
 439
 440        dst = ip6_route_output(net, NULL, &fl6);
 441        if (dst->error)
 442                goto out_err;
 443        if (!ret_saddr)
 444                return dst;
 445        if (ipv6_addr_any(&fl6.saddr) &&
 446            ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
 447                               &fl6.daddr, 0, &fl6.saddr) < 0)
 448                goto out_err;
 449        if (do_xfrm) {
 450                dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
 451                if (IS_ERR(dst)) {
 452                        dst = NULL;
 453                        goto out_err;
 454                }
 455        }
 456        *ret_saddr = fl6.saddr;
 457        return dst;
 458
 459out_err:
 460        dst_release(dst);
 461        IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
 462        return NULL;
 463}
 464
  465/*
  466 * Get route to destination or remote server
      *
      * IPv6 counterpart of __ip_vs_get_out_rt().  With @dest, the route
      * cached in dest->dest_dst is used; when it is missing or no longer valid
      * a new one is looked up and cached under dest->dst_lock, and it is
      * attached to the skb without a reference (noref).  Without @dest, a
      * route to @daddr is looked up and its reference is handed to the skb.
      *
      * @ret_saddr, if not NULL, receives the source address for the route.
      * @do_xfrm is passed on to __ip_vs_route_output_v6().
      *
      * Returns 1 if the route is local (the skb keeps its old route), 0 if
      * the new route was attached to the skb, -1 on error.  The skb is not
      * freed here on error.
  467 */
  468static int
  469__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
  470                      struct ip_vs_dest *dest,
  471                      struct in6_addr *daddr, struct in6_addr *ret_saddr,
  472                      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
  473{
  474        struct net *net = ipvs->net;
  475        struct ip_vs_dest_dst *dest_dst;
  476        struct rt6_info *rt;                    /* Route to the other host */
  477        struct dst_entry *dst;
  478        int mtu;
  479        int local, noref = 1;
  480
  481        if (dest) {
  482                dest_dst = __ip_vs_dst_check(dest);
  483                if (likely(dest_dst))
  484                        rt = (struct rt6_info *) dest_dst->dst_cache;
  485                else {
  486                        u32 cookie;
  487
                             /* Allocate outside the lock; the result is
                              * only checked once the lock is held so that
                              * the stale cache entry can be cleared on
                              * failure.
                              */
  488                        dest_dst = ip_vs_dest_dst_alloc();
  489                        spin_lock_bh(&dest->dst_lock);
  490                        if (!dest_dst) {
  491                                __ip_vs_dst_set(dest, NULL, NULL, 0);
  492                                spin_unlock_bh(&dest->dst_lock);
  493                                goto err_unreach;
  494                        }
  495                        dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
  496                                                      &dest_dst->dst_saddr.in6,
  497                                                      do_xfrm, rt_mode);
  498                        if (!dst) {
  499                                __ip_vs_dst_set(dest, NULL, NULL, 0);
  500                                spin_unlock_bh(&dest->dst_lock);
  501                                ip_vs_dest_dst_free(dest_dst);
  502                                goto err_unreach;
  503                        }
  504                        rt = (struct rt6_info *) dst;
                             /* The cookie is stored with the route and is
                              * passed to ops->check() by __ip_vs_dst_check()
                              */
  505                        cookie = rt6_get_cookie(rt);
  506                        __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
  507                        spin_unlock_bh(&dest->dst_lock);
  508                        IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
  509                                  &dest->addr.in6, &dest_dst->dst_saddr.in6,
  510                                  atomic_read(&rt->dst.__refcnt));
  511                }
  512                if (ret_saddr)
  513                        *ret_saddr = dest_dst->dst_saddr.in6;
  514        } else {
                     /* No cache to borrow from: the skb will own a reference */
  515                noref = 0;
  516                dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
  517                                              rt_mode);
  518                if (!dst)
  519                        goto err_unreach;
  520                rt = (struct rt6_info *) dst;
  521        }
  522
  523        local = __ip_vs_is_local_route6(rt);
  524
  525        if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
  526                                                  local))) {
  527                IP_VS_DBG_RL("We are crossing local and non-local addresses"
  528                             " daddr=%pI6\n", daddr);
  529                goto err_put;
  530        }
  531
  532        if (unlikely(local)) {
  533                /* skb to local stack, preserve old route */
  534                if (!noref)
  535                        dst_release(&rt->dst);
  536                return local;
  537        }
  538
  539        if (!decrement_ttl(ipvs, skb_af, skb))
  540                goto err_put;
  541
  542        /* MTU checking */
  543        if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
  544                mtu = dst_mtu(&rt->dst);
  545        else {
                     /* Tunnel mode: subtract the encapsulation overhead,
                      * starting with the outer IPv6 header.
                      */
  546                mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
                     /* The tunnel parameters are read from dest below */
  547                if (!dest)
  548                        goto err_put;
  549                if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
  550                        mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
  551                        if ((dest->tun_flags &
  552                             IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
  553                            skb->ip_summed == CHECKSUM_PARTIAL)
  554                                mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
  555                } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
  556                        __be16 tflags = 0;
  557
  558                        if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
  559                                tflags |= TUNNEL_CSUM;
  560                        mtu -= gre_calc_hlen(tflags);
  561                }
  562                if (mtu < IPV6_MIN_MTU) {
  563                        IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
  564                                     IPV6_MIN_MTU);
  565                        goto err_put;
  566                }
  567                maybe_update_pmtu(skb_af, skb, mtu);
  568        }
  569
  570        if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
  571                goto err_put;
  572
             /* Replace the route of the skb with the new one */
  573        skb_dst_drop(skb);
  574        if (noref)
  575                skb_dst_set_noref(skb, &rt->dst);
  576        else
  577                skb_dst_set(skb, &rt->dst);
  578
  579        return local;
  580
  581err_put:
             /* A cached (noref) route has no reference of ours to drop */
  582        if (!noref)
  583                dst_release(&rt->dst);
  584        return -1;
  585
  586err_unreach:
  587        /* The ip6_link_failure function requires the dev field to be set
  588         * in order to get the net (further for the sake of fwmark
  589         * reflection).
  590         */
  591        if (!skb->dev)
  592                skb->dev = skb_dst(skb)->dev;
  593
  594        dst_link_failure(skb);
  595        return -1;
  596}
 597#endif
 598
 599
 600/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
 601static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
 602                                            struct ip_vs_conn *cp)
 603{
 604        int ret = NF_ACCEPT;
 605
 606        skb->ipvs_property = 1;
 607        if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
 608                ret = ip_vs_confirm_conntrack(skb);
 609        if (ret == NF_ACCEPT) {
 610                nf_reset_ct(skb);
 611                skb_forward_csum(skb);
 612                if (skb->dev)
 613                        skb->tstamp = 0;
 614        }
 615        return ret;
 616}
 617
 618/* In the event of a remote destination, it's possible that we would have
 619 * matches against an old socket (particularly a TIME-WAIT socket). This
 620 * causes havoc down the line (ip_local_out et. al. expect regular sockets
 621 * and invalid memory accesses will happen) so simply drop the association
 622 * in this case.
 623*/
 624static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
 625{
 626        /* If dev is set, the packet came from the LOCAL_IN callback and
 627         * not from a local TCP socket.
 628         */
 629        if (skb->dev)
 630                skb_orphan(skb);
 631}
 632
 633/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
 634static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
 635                                         struct ip_vs_conn *cp, int local)
 636{
 637        int ret = NF_STOLEN;
 638
 639        skb->ipvs_property = 1;
 640        if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
 641                ip_vs_notrack(skb);
 642        else
 643                ip_vs_update_conntrack(skb, cp, 1);
 644
 645        /* Remove the early_demux association unless it's bound for the
 646         * exact same port and address on this host after translation.
 647         */
 648        if (!local || cp->vport != cp->dport ||
 649            !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
 650                ip_vs_drop_early_demux_sk(skb);
 651
 652        if (!local) {
 653                skb_forward_csum(skb);
 654                if (skb->dev)
 655                        skb->tstamp = 0;
 656                NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
 657                        NULL, skb_dst(skb)->dev, dst_output);
 658        } else
 659                ret = NF_ACCEPT;
 660
 661        return ret;
 662}
 663
 664/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
 665static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
 666                                     struct ip_vs_conn *cp, int local)
 667{
 668        int ret = NF_STOLEN;
 669
 670        skb->ipvs_property = 1;
 671        if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
 672                ip_vs_notrack(skb);
 673        if (!local) {
 674                ip_vs_drop_early_demux_sk(skb);
 675                skb_forward_csum(skb);
 676                if (skb->dev)
 677                        skb->tstamp = 0;
 678                NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
 679                        NULL, skb_dst(skb)->dev, dst_output);
 680        } else
 681                ret = NF_ACCEPT;
 682        return ret;
 683}
 684
 685
 686/*
 687 *      NULL transmitter (do nothing except return NF_ACCEPT)
 688 */
 689int
 690ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 691                struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 692{
 693        /* we do not touch skb and do not need pskb ptr */
 694        return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
 695}
 696
 697
 698/*
 699 *      Bypass transmitter
 700 *      Let packets bypass the destination when the destination is not
 701 *      available, it may be only used in transparent cache cluster.
 702 */
 703int
 704ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 705                  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 706{
 707        struct iphdr  *iph = ip_hdr(skb);
 708
 709        EnterFunction(10);
 710
 711        if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
 712                               IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
 713                goto tx_error;
 714
 715        ip_send_check(iph);
 716
 717        /* Another hack: avoid icmp_send in ip_fragment */
 718        skb->ignore_df = 1;
 719
 720        ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
 721
 722        LeaveFunction(10);
 723        return NF_STOLEN;
 724
 725 tx_error:
 726        kfree_skb(skb);
 727        LeaveFunction(10);
 728        return NF_STOLEN;
 729}
 730
 731#ifdef CONFIG_IP_VS_IPV6
 732int
 733ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 734                     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 735{
 736        struct ipv6hdr *iph = ipv6_hdr(skb);
 737
 738        EnterFunction(10);
 739
 740        if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
 741                                  &iph->daddr, NULL,
 742                                  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
 743                goto tx_error;
 744
 745        /* Another hack: avoid icmp_send in ip_fragment */
 746        skb->ignore_df = 1;
 747
 748        ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
 749
 750        LeaveFunction(10);
 751        return NF_STOLEN;
 752
 753 tx_error:
 754        kfree_skb(skb);
 755        LeaveFunction(10);
 756        return NF_STOLEN;
 757}
 758#endif
 759
  760/*
  761 *      NAT transmitter (only for outside-to-inside nat forwarding)
  762 *      Not used for related ICMP
      *
      *      Routes the packet to the real server of @cp, rewrites the
      *      destination (protocol dnat_handler plus IPv4 daddr) and hands
      *      the packet to ip_vs_nat_send_or_cont().
      *
      *      Returns the result of ip_vs_nat_send_or_cont() (NF_STOLEN when
      *      sent, NF_ACCEPT for a local destination).  On any error the skb
      *      is freed and NF_STOLEN is returned.
  763 */
  764int
  765ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
  766               struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
  767{
  768        struct rtable *rt;              /* Route to the other host */
  769        int local, rc, was_input;
  770
  771        EnterFunction(10);
  772
  773        /* check if it is a connection of no-client-port */
  774        if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
  775                __be16 _pt, *p;
  776
                     /* The first 16 bits after the IP header are used as
                      * the client port
                      */
  777                p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
  778                if (p == NULL)
  779                        goto tx_error;
  780                ip_vs_conn_fill_cport(cp, *p);
  781                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
  782        }
  783
             /* Sample the original route before the lookup below may
              * replace it
              */
  784        was_input = rt_is_input_route(skb_rtable(skb));
  785        local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
  786                                   IP_VS_RT_MODE_LOCAL |
  787                                   IP_VS_RT_MODE_NON_LOCAL |
  788                                   IP_VS_RT_MODE_RDR, NULL, ipvsh);
  789        if (local < 0)
  790                goto tx_error;
             /* Route now attached to the skb (still the old one if local) */
  791        rt = skb_rtable(skb);
  792        /*
  793         * Avoid duplicate tuple in reply direction for NAT traffic
  794         * to local address when connection is sync-ed
  795         */
  796#if IS_ENABLED(CONFIG_NF_CONNTRACK)
  797        if (cp->flags & IP_VS_CONN_F_SYNC && local) {
  798                enum ip_conntrack_info ctinfo;
  799                struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
  800
  801                if (ct) {
  802                        IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
  803                                         "ip_vs_nat_xmit(): "
  804                                         "stopping DNAT to local address");
  805                        goto tx_error;
  806                }
  807        }
  808#endif
  809
  810        /* From world but DNAT to loopback address? */
  811        if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
  812                IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
  813                                 "ip_vs_nat_xmit(): stopping DNAT to loopback "
  814                                 "address");
  815                goto tx_error;
  816        }
  817
  818        /* copy-on-write the packet before mangling it */
  819        if (skb_ensure_writable(skb, sizeof(struct iphdr)))
  820                goto tx_error;
  821
             /* Headroom sized by the link-layer header length of the
              * output device
              */
  822        if (skb_cow(skb, rt->dst.dev->hard_header_len))
  823                goto tx_error;
  824
  825        /* mangle the packet */
  826        if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
  827                goto tx_error;
             /* The header is re-read with ip_hdr() after the calls above */
  828        ip_hdr(skb)->daddr = cp->daddr.ip;
  829        ip_send_check(ip_hdr(skb));
  830
  831        IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");
  832
  833        /* FIXME: when application helper enlarges the packet and the length
  834           is larger than the MTU of outgoing device, there will be still
  835           MTU problem. */
  836
  837        /* Another hack: avoid icmp_send in ip_fragment */
  838        skb->ignore_df = 1;
  839
  840        rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
  841
  842        LeaveFunction(10);
  843        return rc;
  844
  845  tx_error:
  846        kfree_skb(skb);
  847        LeaveFunction(10);
  848        return NF_STOLEN;
  849}
 850
 851#ifdef CONFIG_IP_VS_IPV6
 852int
 853ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 854                  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
 855{
 856        struct rt6_info *rt;            /* Route to the other host */
 857        int local, rc;
 858
 859        EnterFunction(10);
 860
 861        /* check if it is a connection of no-client-port */
 862        if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
 863                __be16 _pt, *p;
 864                p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
 865                if (p == NULL)
 866                        goto tx_error;
 867                ip_vs_conn_fill_cport(cp, *p);
 868                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 869        }
 870
 871        local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
 872                                      &cp->daddr.in6,
 873                                      NULL, ipvsh, 0,
 874                                      IP_VS_RT_MODE_LOCAL |
 875                                      IP_VS_RT_MODE_NON_LOCAL |
 876                                      IP_VS_RT_MODE_RDR);
 877        if (local < 0)
 878                goto tx_error;
 879        rt = (struct rt6_info *) skb_dst(skb);
 880        /*
 881         * Avoid duplicate tuple in reply direction for NAT traffic
 882         * to local address when connection is sync-ed
 883         */
 884#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 885        if (cp->flags & IP_VS_CONN_F_SYNC && local) {
 886                enum ip_conntrack_info ctinfo;
 887                struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 888
 889                if (ct) {
 890                        IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
 891                                         "ip_vs_nat_xmit_v6(): "
 892                                         "stopping DNAT to local address");
 893                        goto tx_error;
 894                }
 895        }
 896#endif
 897
 898        /* From world but DNAT to loopback address? */
 899        if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
 900            ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
 901                IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
 902                                 "ip_vs_nat_xmit_v6(): "
 903                                 "stopping DNAT to loopback address");
 904                goto tx_error;
 905        }
 906
 907        /* copy-on-write the packet before mangling it */
 908        if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
 909                goto tx_error;
 910
 911        if (skb_cow(skb, rt->dst.dev->hard_header_len))
 912                goto tx_error;
 913
 914        /* mangle the packet */
 915        if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
 916                goto tx_error;
 917        ipv6_hdr(skb)->daddr = cp->daddr.in6;
 918
 919        IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");
 920
 921        /* FIXME: when application helper enlarges the packet and the length
 922           is larger than the MTU of outgoing device, there will be still
 923           MTU problem. */
 924
 925        /* Another hack: avoid icmp_send in ip_fragment */
 926        skb->ignore_df = 1;
 927
 928        rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
 929
 930        LeaveFunction(10);
 931        return rc;
 932
 933tx_error:
 934        LeaveFunction(10);
 935        kfree_skb(skb);
 936        return NF_STOLEN;
 937}
 938#endif
 939
 940/* When forwarding a packet, we must ensure that we've got enough headroom
 941 * for the encapsulation packet in the skb.  This also gives us an
 942 * opportunity to figure out what the payload_len, dsfield, ttl, and df
 943 * values should be, so that we won't need to look at the old ip header
 944 * again
 945 */
 946static struct sk_buff *
 947ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 948                           unsigned int max_headroom, __u8 *next_protocol,
 949                           __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
 950                           __be16 *df)
 951{
 952        struct sk_buff *new_skb = NULL;
 953        struct iphdr *old_iph = NULL;
 954        __u8 old_dsfield;
 955#ifdef CONFIG_IP_VS_IPV6
 956        struct ipv6hdr *old_ipv6h = NULL;
 957#endif
 958
 959        ip_vs_drop_early_demux_sk(skb);
 960
 961        if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
 962                new_skb = skb_realloc_headroom(skb, max_headroom);
 963                if (!new_skb)
 964                        goto error;
 965                if (skb->sk)
 966                        skb_set_owner_w(new_skb, skb->sk);
 967                consume_skb(skb);
 968                skb = new_skb;
 969        }
 970
 971#ifdef CONFIG_IP_VS_IPV6
 972        if (skb_af == AF_INET6) {
 973                old_ipv6h = ipv6_hdr(skb);
 974                *next_protocol = IPPROTO_IPV6;
 975                if (payload_len)
 976                        *payload_len =
 977                                ntohs(old_ipv6h->payload_len) +
 978                                sizeof(*old_ipv6h);
 979                old_dsfield = ipv6_get_dsfield(old_ipv6h);
 980                *ttl = old_ipv6h->hop_limit;
 981                if (df)
 982                        *df = 0;
 983        } else
 984#endif
 985        {
 986                old_iph = ip_hdr(skb);
 987                /* Copy DF, reset fragment offset and MF */
 988                if (df)
 989                        *df = (old_iph->frag_off & htons(IP_DF));
 990                *next_protocol = IPPROTO_IPIP;
 991
 992                /* fix old IP header checksum */
 993                ip_send_check(old_iph);
 994                old_dsfield = ipv4_get_dsfield(old_iph);
 995                *ttl = old_iph->ttl;
 996                if (payload_len)
 997                        *payload_len = ntohs(old_iph->tot_len);
 998        }
 999
1000        /* Implement full-functionality option for ECN encapsulation */
1001        *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);
1002
1003        return skb;
1004error:
1005        kfree_skb(skb);
1006        return ERR_PTR(-ENOMEM);
1007}
1008
1009static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
1010{
1011        switch (encaps_af) {
1012        case AF_INET:
1013                return SKB_GSO_IPXIP4;
1014        case AF_INET6:
1015                return SKB_GSO_IPXIP6;
1016        default:
1017                return 0;
1018        }
1019}
1020
1021static int
1022ipvs_gue_encap(struct net *net, struct sk_buff *skb,
1023               struct ip_vs_conn *cp, __u8 *next_protocol)
1024{
1025        __be16 dport;
1026        __be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
1027        struct udphdr  *udph;   /* Our new UDP header */
1028        struct guehdr  *gueh;   /* Our new GUE header */
1029        size_t hdrlen, optlen = 0;
1030        void *data;
1031        bool need_priv = false;
1032
1033        if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1034            skb->ip_summed == CHECKSUM_PARTIAL) {
1035                optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
1036                need_priv = true;
1037        }
1038
1039        hdrlen = sizeof(struct guehdr) + optlen;
1040
1041        skb_push(skb, hdrlen);
1042
1043        gueh = (struct guehdr *)skb->data;
1044
1045        gueh->control = 0;
1046        gueh->version = 0;
1047        gueh->hlen = optlen >> 2;
1048        gueh->flags = 0;
1049        gueh->proto_ctype = *next_protocol;
1050
1051        data = &gueh[1];
1052
1053        if (need_priv) {
1054                __be32 *flags = data;
1055                u16 csum_start = skb_checksum_start_offset(skb);
1056                __be16 *pd;
1057
1058                gueh->flags |= GUE_FLAG_PRIV;
1059                *flags = 0;
1060                data += GUE_LEN_PRIV;
1061
1062                if (csum_start < hdrlen)
1063                        return -EINVAL;
1064
1065                csum_start -= hdrlen;
1066                pd = data;
1067                pd[0] = htons(csum_start);
1068                pd[1] = htons(csum_start + skb->csum_offset);
1069
1070                if (!skb_is_gso(skb)) {
1071                        skb->ip_summed = CHECKSUM_NONE;
1072                        skb->encapsulation = 0;
1073                }
1074
1075                *flags |= GUE_PFLAG_REMCSUM;
1076                data += GUE_PLEN_REMCSUM;
1077        }
1078
1079        skb_push(skb, sizeof(struct udphdr));
1080        skb_reset_transport_header(skb);
1081
1082        udph = udp_hdr(skb);
1083
1084        dport = cp->dest->tun_port;
1085        udph->dest = dport;
1086        udph->source = sport;
1087        udph->len = htons(skb->len);
1088        udph->check = 0;
1089
1090        *next_protocol = IPPROTO_UDP;
1091
1092        return 0;
1093}
1094
1095static void
1096ipvs_gre_encap(struct net *net, struct sk_buff *skb,
1097               struct ip_vs_conn *cp, __u8 *next_protocol)
1098{
1099        __be16 proto = *next_protocol == IPPROTO_IPIP ?
1100                                htons(ETH_P_IP) : htons(ETH_P_IPV6);
1101        __be16 tflags = 0;
1102        size_t hdrlen;
1103
1104        if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1105                tflags |= TUNNEL_CSUM;
1106
1107        hdrlen = gre_calc_hlen(tflags);
1108        gre_build_header(skb, hdrlen, tflags, proto, 0, 0);
1109
1110        *next_protocol = IPPROTO_GRE;
1111}
1112
1113/*
1114 *   IP Tunneling transmitter
1115 *
1116 *   This function encapsulates the packet in a new IP packet, its
1117 *   destination will be set to cp->daddr. Most code of this function
1118 *   is taken from ipip.c.
1119 *
1120 *   It is used in VS/TUN cluster. The load balancer selects a real
1121 *   server from a cluster based on a scheduling algorithm,
1122 *   encapsulates the request packet and forwards it to the selected
1123 *   server. For example, all real servers are configured with
1124 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
1125 *   the encapsulated packet, it will decapsulate the packet, processe
1126 *   the request and return the response packets directly to the client
1127 *   without passing the load balancer. This can greatly increase the
1128 *   scalability of virtual server.
1129 *
1130 *   Used for ANY protocol
1131 */
1132int
1133ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1134                  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1135{
1136        struct netns_ipvs *ipvs = cp->ipvs;
1137        struct net *net = ipvs->net;
1138        struct rtable *rt;                      /* Route to the other host */
1139        __be32 saddr;                           /* Source for tunnel */
1140        struct net_device *tdev;                /* Device to other host */
1141        __u8 next_protocol = 0;
1142        __u8 dsfield = 0;
1143        __u8 ttl = 0;
1144        __be16 df = 0;
1145        __be16 *dfp = NULL;
1146        struct iphdr  *iph;                     /* Our new IP header */
1147        unsigned int max_headroom;              /* The extra header space needed */
1148        int ret, local;
1149        int tun_type, gso_type;
1150        int tun_flags;
1151
1152        EnterFunction(10);
1153
1154        local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
1155                                   IP_VS_RT_MODE_LOCAL |
1156                                   IP_VS_RT_MODE_NON_LOCAL |
1157                                   IP_VS_RT_MODE_CONNECT |
1158                                   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
1159        if (local < 0)
1160                goto tx_error;
1161        if (local)
1162                return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
1163
1164        rt = skb_rtable(skb);
1165        tdev = rt->dst.dev;
1166
1167        /*
1168         * Okay, now see if we can stuff it in the buffer as-is.
1169         */
1170        max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
1171
1172        tun_type = cp->dest->tun_type;
1173        tun_flags = cp->dest->tun_flags;
1174
1175        if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1176                size_t gue_hdrlen, gue_optlen = 0;
1177
1178                if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1179                    skb->ip_summed == CHECKSUM_PARTIAL) {
1180                        gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
1181                }
1182                gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
1183
1184                max_headroom += sizeof(struct udphdr) + gue_hdrlen;
1185        } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1186                size_t gre_hdrlen;
1187                __be16 tflags = 0;
1188
1189                if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1190                        tflags |= TUNNEL_CSUM;
1191                gre_hdrlen = gre_calc_hlen(tflags);
1192
1193                max_headroom += gre_hdrlen;
1194        }
1195
1196        /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
1197        dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
1198        skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
1199                                         &next_protocol, NULL, &dsfield,
1200                                         &ttl, dfp);
1201        if (IS_ERR(skb))
1202                goto tx_error;
1203
1204        gso_type = __tun_gso_type_mask(AF_INET, cp->af);
1205        if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1206                if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
1207                    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1208                        gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
1209                else
1210                        gso_type |= SKB_GSO_UDP_TUNNEL;
1211                if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1212                    skb->ip_summed == CHECKSUM_PARTIAL) {
1213                        gso_type |= SKB_GSO_TUNNEL_REMCSUM;
1214                }
1215        } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1216                if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1217                        gso_type |= SKB_GSO_GRE_CSUM;
1218                else
1219                        gso_type |= SKB_GSO_GRE;
1220        }
1221
1222        if (iptunnel_handle_offloads(skb, gso_type))
1223                goto tx_error;
1224
1225        skb->transport_header = skb->network_header;
1226
1227        skb_set_inner_ipproto(skb, next_protocol);
1228
1229        if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1230                bool check = false;
1231
1232                if (ipvs_gue_encap(net, skb, cp, &next_protocol))
1233                        goto tx_error;
1234
1235                if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
1236                    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1237                        check = true;
1238
1239                udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
1240        } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
1241                ipvs_gre_encap(net, skb, cp, &next_protocol);
1242
1243        skb_push(skb, sizeof(struct iphdr));
1244        skb_reset_network_header(skb);
1245        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1246
1247        /*
1248         *      Push down and install the IPIP header.
1249         */
1250        iph                     =       ip_hdr(skb);
1251        iph->version            =       4;
1252        iph->ihl                =       sizeof(struct iphdr)>>2;
1253        iph->frag_off           =       df;
1254        iph->protocol           =       next_protocol;
1255        iph->tos                =       dsfield;
1256        iph->daddr              =       cp->daddr.ip;
1257        iph->saddr              =       saddr;
1258        iph->ttl                =       ttl;
1259        ip_select_ident(net, skb, NULL);
1260
1261        /* Another hack: avoid icmp_send in ip_fragment */
1262        skb->ignore_df = 1;
1263
1264        ret = ip_vs_tunnel_xmit_prepare(skb, cp);
1265        if (ret == NF_ACCEPT)
1266                ip_local_out(net, skb->sk, skb);
1267        else if (ret == NF_DROP)
1268                kfree_skb(skb);
1269
1270        LeaveFunction(10);
1271
1272        return NF_STOLEN;
1273
1274  tx_error:
1275        if (!IS_ERR(skb))
1276                kfree_skb(skb);
1277        LeaveFunction(10);
1278        return NF_STOLEN;
1279}
1280
1281#ifdef CONFIG_IP_VS_IPV6
1282int
1283ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1284                     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1285{
1286        struct netns_ipvs *ipvs = cp->ipvs;
1287        struct net *net = ipvs->net;
1288        struct rt6_info *rt;            /* Route to the other host */
1289        struct in6_addr saddr;          /* Source for tunnel */
1290        struct net_device *tdev;        /* Device to other host */
1291        __u8 next_protocol = 0;
1292        __u32 payload_len = 0;
1293        __u8 dsfield = 0;
1294        __u8 ttl = 0;
1295        struct ipv6hdr  *iph;           /* Our new IP header */
1296        unsigned int max_headroom;      /* The extra header space needed */
1297        int ret, local;
1298        int tun_type, gso_type;
1299        int tun_flags;
1300
1301        EnterFunction(10);
1302
1303        local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
1304                                      &cp->daddr.in6,
1305                                      &saddr, ipvsh, 1,
1306                                      IP_VS_RT_MODE_LOCAL |
1307                                      IP_VS_RT_MODE_NON_LOCAL |
1308                                      IP_VS_RT_MODE_TUNNEL);
1309        if (local < 0)
1310                goto tx_error;
1311        if (local)
1312                return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
1313
1314        rt = (struct rt6_info *) skb_dst(skb);
1315        tdev = rt->dst.dev;
1316
1317        /*
1318         * Okay, now see if we can stuff it in the buffer as-is.
1319         */
1320        max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
1321
1322        tun_type = cp->dest->tun_type;
1323        tun_flags = cp->dest->tun_flags;
1324
1325        if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1326                size_t gue_hdrlen, gue_optlen = 0;
1327
1328                if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1329                    skb->ip_summed == CHECKSUM_PARTIAL) {
1330                        gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
1331                }
1332                gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
1333
1334                max_headroom += sizeof(struct udphdr) + gue_hdrlen;
1335        } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1336                size_t gre_hdrlen;
1337                __be16 tflags = 0;
1338
1339                if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1340                        tflags |= TUNNEL_CSUM;
1341                gre_hdrlen = gre_calc_hlen(tflags);
1342
1343                max_headroom += gre_hdrlen;
1344        }
1345
1346        skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
1347                                         &next_protocol, &payload_len,
1348                                         &dsfield, &ttl, NULL);
1349        if (IS_ERR(skb))
1350                goto tx_error;
1351
1352        gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
1353        if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1354                if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
1355                    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1356                        gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
1357                else
1358                        gso_type |= SKB_GSO_UDP_TUNNEL;
1359                if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1360                    skb->ip_summed == CHECKSUM_PARTIAL) {
1361                        gso_type |= SKB_GSO_TUNNEL_REMCSUM;
1362                }
1363        } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1364                if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1365                        gso_type |= SKB_GSO_GRE_CSUM;
1366                else
1367                        gso_type |= SKB_GSO_GRE;
1368        }
1369
1370        if (iptunnel_handle_offloads(skb, gso_type))
1371                goto tx_error;
1372
1373        skb->transport_header = skb->network_header;
1374
1375        skb_set_inner_ipproto(skb, next_protocol);
1376
1377        if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1378                bool check = false;
1379
1380                if (ipvs_gue_encap(net, skb, cp, &next_protocol))
1381                        goto tx_error;
1382
1383                if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
1384                    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1385                        check = true;
1386
1387                udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
1388        } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
1389                ipvs_gre_encap(net, skb, cp, &next_protocol);
1390
1391        skb_push(skb, sizeof(struct ipv6hdr));
1392        skb_reset_network_header(skb);
1393        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1394
1395        /*
1396         *      Push down and install the IPIP header.
1397         */
1398        iph                     =       ipv6_hdr(skb);
1399        iph->version            =       6;
1400        iph->nexthdr            =       next_protocol;
1401        iph->payload_len        =       htons(payload_len);
1402        memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
1403        ipv6_change_dsfield(iph, 0, dsfield);
1404        iph->daddr = cp->daddr.in6;
1405        iph->saddr = saddr;
1406        iph->hop_limit          =       ttl;
1407
1408        /* Another hack: avoid icmp_send in ip_fragment */
1409        skb->ignore_df = 1;
1410
1411        ret = ip_vs_tunnel_xmit_prepare(skb, cp);
1412        if (ret == NF_ACCEPT)
1413                ip6_local_out(net, skb->sk, skb);
1414        else if (ret == NF_DROP)
1415                kfree_skb(skb);
1416
1417        LeaveFunction(10);
1418
1419        return NF_STOLEN;
1420
1421tx_error:
1422        if (!IS_ERR(skb))
1423                kfree_skb(skb);
1424        LeaveFunction(10);
1425        return NF_STOLEN;
1426}
1427#endif
1428
1429
1430/*
1431 *      Direct Routing transmitter
1432 *      Used for ANY protocol
1433 */
1434int
1435ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1436              struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1437{
1438        int local;
1439
1440        EnterFunction(10);
1441
1442        local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
1443                                   IP_VS_RT_MODE_LOCAL |
1444                                   IP_VS_RT_MODE_NON_LOCAL |
1445                                   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
1446        if (local < 0)
1447                goto tx_error;
1448        if (local)
1449                return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
1450
1451        ip_send_check(ip_hdr(skb));
1452
1453        /* Another hack: avoid icmp_send in ip_fragment */
1454        skb->ignore_df = 1;
1455
1456        ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
1457
1458        LeaveFunction(10);
1459        return NF_STOLEN;
1460
1461  tx_error:
1462        kfree_skb(skb);
1463        LeaveFunction(10);
1464        return NF_STOLEN;
1465}
1466
1467#ifdef CONFIG_IP_VS_IPV6
1468int
1469ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1470                 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1471{
1472        int local;
1473
1474        EnterFunction(10);
1475
1476        local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
1477                                      &cp->daddr.in6,
1478                                      NULL, ipvsh, 0,
1479                                      IP_VS_RT_MODE_LOCAL |
1480                                      IP_VS_RT_MODE_NON_LOCAL |
1481                                      IP_VS_RT_MODE_KNOWN_NH);
1482        if (local < 0)
1483                goto tx_error;
1484        if (local)
1485                return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
1486
1487        /* Another hack: avoid icmp_send in ip_fragment */
1488        skb->ignore_df = 1;
1489
1490        ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
1491
1492        LeaveFunction(10);
1493        return NF_STOLEN;
1494
1495tx_error:
1496        kfree_skb(skb);
1497        LeaveFunction(10);
1498        return NF_STOLEN;
1499}
1500#endif
1501
1502
1503/*
1504 *      ICMP packet transmitter
1505 *      called by the ip_vs_in_icmp
1506 */
1507int
1508ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1509                struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
1510                struct ip_vs_iphdr *iph)
1511{
1512        struct rtable   *rt;    /* Route to the other host */
1513        int rc;
1514        int local;
1515        int rt_mode, was_input;
1516
1517        EnterFunction(10);
1518
1519        /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1520           forwarded directly here, because there is no need to
1521           translate address/port back */
1522        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
1523                if (cp->packet_xmit)
1524                        rc = cp->packet_xmit(skb, cp, pp, iph);
1525                else
1526                        rc = NF_ACCEPT;
1527                /* do not touch skb anymore */
1528                atomic_inc(&cp->in_pkts);
1529                goto out;
1530        }
1531
1532        /*
1533         * mangle and send the packet here (only for VS/NAT)
1534         */
1535        was_input = rt_is_input_route(skb_rtable(skb));
1536
1537        /* LOCALNODE from FORWARD hook is not supported */
1538        rt_mode = (hooknum != NF_INET_FORWARD) ?
1539                  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
1540                  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
1541        local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
1542                                   NULL, iph);
1543        if (local < 0)
1544                goto tx_error;
1545        rt = skb_rtable(skb);
1546
1547        /*
1548         * Avoid duplicate tuple in reply direction for NAT traffic
1549         * to local address when connection is sync-ed
1550         */
1551#if IS_ENABLED(CONFIG_NF_CONNTRACK)
1552        if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1553                enum ip_conntrack_info ctinfo;
1554                struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
1555
1556                if (ct) {
1557                        IP_VS_DBG(10, "%s(): "
1558                                  "stopping DNAT to local address %pI4\n",
1559                                  __func__, &cp->daddr.ip);
1560                        goto tx_error;
1561                }
1562        }
1563#endif
1564
1565        /* From world but DNAT to loopback address? */
1566        if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
1567                IP_VS_DBG(1, "%s(): "
1568                          "stopping DNAT to loopback %pI4\n",
1569                          __func__, &cp->daddr.ip);
1570                goto tx_error;
1571        }
1572
1573        /* copy-on-write the packet before mangling it */
1574        if (skb_ensure_writable(skb, offset))
1575                goto tx_error;
1576
1577        if (skb_cow(skb, rt->dst.dev->hard_header_len))
1578                goto tx_error;
1579
1580        ip_vs_nat_icmp(skb, pp, cp, 0);
1581
1582        /* Another hack: avoid icmp_send in ip_fragment */
1583        skb->ignore_df = 1;
1584
1585        rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
1586        goto out;
1587
1588  tx_error:
1589        kfree_skb(skb);
1590        rc = NF_STOLEN;
1591  out:
1592        LeaveFunction(10);
1593        return rc;
1594}
1595
1596#ifdef CONFIG_IP_VS_IPV6
1597int
1598ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1599                struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
1600                struct ip_vs_iphdr *ipvsh)
1601{
1602        struct rt6_info *rt;    /* Route to the other host */
1603        int rc;
1604        int local;
1605        int rt_mode;
1606
1607        EnterFunction(10);
1608
1609        /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1610           forwarded directly here, because there is no need to
1611           translate address/port back */
1612        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
1613                if (cp->packet_xmit)
1614                        rc = cp->packet_xmit(skb, cp, pp, ipvsh);
1615                else
1616                        rc = NF_ACCEPT;
1617                /* do not touch skb anymore */
1618                atomic_inc(&cp->in_pkts);
1619                goto out;
1620        }
1621
1622        /*
1623         * mangle and send the packet here (only for VS/NAT)
1624         */
1625
1626        /* LOCALNODE from FORWARD hook is not supported */
1627        rt_mode = (hooknum != NF_INET_FORWARD) ?
1628                  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
1629                  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
1630        local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
1631                                      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
1632        if (local < 0)
1633                goto tx_error;
1634        rt = (struct rt6_info *) skb_dst(skb);
1635        /*
1636         * Avoid duplicate tuple in reply direction for NAT traffic
1637         * to local address when connection is sync-ed
1638         */
1639#if IS_ENABLED(CONFIG_NF_CONNTRACK)
1640        if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1641                enum ip_conntrack_info ctinfo;
1642                struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
1643
1644                if (ct) {
1645                        IP_VS_DBG(10, "%s(): "
1646                                  "stopping DNAT to local address %pI6\n",
1647                                  __func__, &cp->daddr.in6);
1648                        goto tx_error;
1649                }
1650        }
1651#endif
1652
1653        /* From world but DNAT to loopback address? */
1654        if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
1655            ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
1656                IP_VS_DBG(1, "%s(): "
1657                          "stopping DNAT to loopback %pI6\n",
1658                          __func__, &cp->daddr.in6);
1659                goto tx_error;
1660        }
1661
1662        /* copy-on-write the packet before mangling it */
1663        if (skb_ensure_writable(skb, offset))
1664                goto tx_error;
1665
1666        if (skb_cow(skb, rt->dst.dev->hard_header_len))
1667                goto tx_error;
1668
1669        ip_vs_nat_icmp_v6(skb, pp, cp, 0);
1670
1671        /* Another hack: avoid icmp_send in ip_fragment */
1672        skb->ignore_df = 1;
1673
1674        rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
1675        goto out;
1676
1677tx_error:
1678        kfree_skb(skb);
1679        rc = NF_STOLEN;
1680out:
1681        LeaveFunction(10);
1682        return rc;
1683}
1684#endif
1685