linux/net/netfilter/ipvs/ip_vs_core.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * IPVS         An implementation of the IP virtual server support for the
   4 *              LINUX operating system.  IPVS is now implemented as a module
   5 *              over the Netfilter framework. IPVS can be used to build a
   6 *              high-performance and highly available server based on a
   7 *              cluster of servers.
   8 *
   9 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
  10 *              Peter Kese <peter.kese@ijs.si>
  11 *              Julian Anastasov <ja@ssi.bg>
  12 *
  13 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
  14 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
  15 * and others.
  16 *
  17 * Changes:
  18 *      Paul `Rusty' Russell            properly handle non-linear skbs
  19 *      Harald Welte                    don't use nfcache
  20 */
  21
  22#define KMSG_COMPONENT "IPVS"
  23#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  24
  25#include <linux/module.h>
  26#include <linux/kernel.h>
  27#include <linux/ip.h>
  28#include <linux/tcp.h>
  29#include <linux/sctp.h>
  30#include <linux/icmp.h>
  31#include <linux/slab.h>
  32
  33#include <net/ip.h>
  34#include <net/tcp.h>
  35#include <net/udp.h>
  36#include <net/icmp.h>                   /* for icmp_send */
  37#include <net/gue.h>
  38#include <net/gre.h>
  39#include <net/route.h>
  40#include <net/ip6_checksum.h>
  41#include <net/netns/generic.h>          /* net_generic() */
  42
  43#include <linux/netfilter.h>
  44#include <linux/netfilter_ipv4.h>
  45
  46#ifdef CONFIG_IP_VS_IPV6
  47#include <net/ipv6.h>
  48#include <linux/netfilter_ipv6.h>
  49#include <net/ip6_route.h>
  50#endif
  51
  52#include <net/ip_vs.h>
  53#include <linux/indirect_call_wrapper.h>
  54
  55
  56EXPORT_SYMBOL(register_ip_vs_scheduler);
  57EXPORT_SYMBOL(unregister_ip_vs_scheduler);
  58EXPORT_SYMBOL(ip_vs_proto_name);
  59EXPORT_SYMBOL(ip_vs_conn_new);
  60EXPORT_SYMBOL(ip_vs_conn_in_get);
  61EXPORT_SYMBOL(ip_vs_conn_out_get);
  62#ifdef CONFIG_IP_VS_PROTO_TCP
  63EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
  64#endif
  65EXPORT_SYMBOL(ip_vs_conn_put);
  66#ifdef CONFIG_IP_VS_DEBUG
  67EXPORT_SYMBOL(ip_vs_get_debug_level);
  68#endif
  69EXPORT_SYMBOL(ip_vs_new_conn_out);
  70
  71#ifdef CONFIG_IP_VS_PROTO_TCP
  72INDIRECT_CALLABLE_DECLARE(int
  73        tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
  74                         struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
  75#endif
  76
  77#ifdef CONFIG_IP_VS_PROTO_UDP
  78INDIRECT_CALLABLE_DECLARE(int
  79        udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
  80                         struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
  81#endif
  82
  83#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP)
  84#define SNAT_CALL(f, ...) \
  85        INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__)
  86#elif defined(CONFIG_IP_VS_PROTO_TCP)
  87#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__)
  88#elif defined(CONFIG_IP_VS_PROTO_UDP)
  89#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__)
  90#else
  91#define SNAT_CALL(f, ...) f(__VA_ARGS__)
  92#endif
  93
  94static unsigned int ip_vs_net_id __read_mostly;
  95/* netns cnt used for uniqueness */
  96static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
  97
  98/* ID used in ICMP lookups */
  99#define icmp_id(icmph)          (((icmph)->un).echo.id)
 100#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
 101
 102const char *ip_vs_proto_name(unsigned int proto)
 103{
 104        static char buf[20];
 105
 106        switch (proto) {
 107        case IPPROTO_IP:
 108                return "IP";
 109        case IPPROTO_UDP:
 110                return "UDP";
 111        case IPPROTO_TCP:
 112                return "TCP";
 113        case IPPROTO_SCTP:
 114                return "SCTP";
 115        case IPPROTO_ICMP:
 116                return "ICMP";
 117#ifdef CONFIG_IP_VS_IPV6
 118        case IPPROTO_ICMPV6:
 119                return "ICMPv6";
 120#endif
 121        default:
 122                sprintf(buf, "IP_%u", proto);
 123                return buf;
 124        }
 125}
 126
 127void ip_vs_init_hash_table(struct list_head *table, int rows)
 128{
 129        while (--rows >= 0)
 130                INIT_LIST_HEAD(&table[rows]);
 131}
 132
 133static inline void
 134ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 135{
 136        struct ip_vs_dest *dest = cp->dest;
 137        struct netns_ipvs *ipvs = cp->ipvs;
 138
 139        if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 140                struct ip_vs_cpu_stats *s;
 141                struct ip_vs_service *svc;
 142
 143                local_bh_disable();
 144
 145                s = this_cpu_ptr(dest->stats.cpustats);
 146                u64_stats_update_begin(&s->syncp);
 147                s->cnt.inpkts++;
 148                s->cnt.inbytes += skb->len;
 149                u64_stats_update_end(&s->syncp);
 150
 151                svc = rcu_dereference(dest->svc);
 152                s = this_cpu_ptr(svc->stats.cpustats);
 153                u64_stats_update_begin(&s->syncp);
 154                s->cnt.inpkts++;
 155                s->cnt.inbytes += skb->len;
 156                u64_stats_update_end(&s->syncp);
 157
 158                s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 159                u64_stats_update_begin(&s->syncp);
 160                s->cnt.inpkts++;
 161                s->cnt.inbytes += skb->len;
 162                u64_stats_update_end(&s->syncp);
 163
 164                local_bh_enable();
 165        }
 166}
 167
 168
 169static inline void
 170ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 171{
 172        struct ip_vs_dest *dest = cp->dest;
 173        struct netns_ipvs *ipvs = cp->ipvs;
 174
 175        if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 176                struct ip_vs_cpu_stats *s;
 177                struct ip_vs_service *svc;
 178
 179                local_bh_disable();
 180
 181                s = this_cpu_ptr(dest->stats.cpustats);
 182                u64_stats_update_begin(&s->syncp);
 183                s->cnt.outpkts++;
 184                s->cnt.outbytes += skb->len;
 185                u64_stats_update_end(&s->syncp);
 186
 187                svc = rcu_dereference(dest->svc);
 188                s = this_cpu_ptr(svc->stats.cpustats);
 189                u64_stats_update_begin(&s->syncp);
 190                s->cnt.outpkts++;
 191                s->cnt.outbytes += skb->len;
 192                u64_stats_update_end(&s->syncp);
 193
 194                s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 195                u64_stats_update_begin(&s->syncp);
 196                s->cnt.outpkts++;
 197                s->cnt.outbytes += skb->len;
 198                u64_stats_update_end(&s->syncp);
 199
 200                local_bh_enable();
 201        }
 202}
 203
 204
 205static inline void
 206ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 207{
 208        struct netns_ipvs *ipvs = svc->ipvs;
 209        struct ip_vs_cpu_stats *s;
 210
 211        local_bh_disable();
 212
 213        s = this_cpu_ptr(cp->dest->stats.cpustats);
 214        u64_stats_update_begin(&s->syncp);
 215        s->cnt.conns++;
 216        u64_stats_update_end(&s->syncp);
 217
 218        s = this_cpu_ptr(svc->stats.cpustats);
 219        u64_stats_update_begin(&s->syncp);
 220        s->cnt.conns++;
 221        u64_stats_update_end(&s->syncp);
 222
 223        s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 224        u64_stats_update_begin(&s->syncp);
 225        s->cnt.conns++;
 226        u64_stats_update_end(&s->syncp);
 227
 228        local_bh_enable();
 229}
 230
 231
 232static inline void
 233ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 234                const struct sk_buff *skb,
 235                struct ip_vs_proto_data *pd)
 236{
 237        if (likely(pd->pp->state_transition))
 238                pd->pp->state_transition(cp, direction, skb, pd);
 239}
 240
 241static inline int
 242ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 243                              struct sk_buff *skb, int protocol,
 244                              const union nf_inet_addr *caddr, __be16 cport,
 245                              const union nf_inet_addr *vaddr, __be16 vport,
 246                              struct ip_vs_conn_param *p)
 247{
 248        ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr,
 249                              vport, p);
 250        p->pe = rcu_dereference(svc->pe);
 251        if (p->pe && p->pe->fill_param)
 252                return p->pe->fill_param(p, skb);
 253
 254        return 0;
 255}
 256
 257/*
 258 *  IPVS persistent scheduling function
 259 *  It creates a connection entry according to its template if exists,
 260 *  or selects a server and creates a connection entry plus a template.
 261 *  Locking: we are svc user (svc->refcnt), so we hold all dests too
 262 *  Protocols supported: TCP, UDP
 263 */
 264static struct ip_vs_conn *
 265ip_vs_sched_persist(struct ip_vs_service *svc,
 266                    struct sk_buff *skb, __be16 src_port, __be16 dst_port,
 267                    int *ignored, struct ip_vs_iphdr *iph)
 268{
 269        struct ip_vs_conn *cp = NULL;
 270        struct ip_vs_dest *dest;
 271        struct ip_vs_conn *ct;
 272        __be16 dport = 0;               /* destination port to forward */
 273        unsigned int flags;
 274        struct ip_vs_conn_param param;
 275        const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
 276        union nf_inet_addr snet;        /* source network of the client,
 277                                           after masking */
 278        const union nf_inet_addr *src_addr, *dst_addr;
 279
 280        if (likely(!ip_vs_iph_inverse(iph))) {
 281                src_addr = &iph->saddr;
 282                dst_addr = &iph->daddr;
 283        } else {
 284                src_addr = &iph->daddr;
 285                dst_addr = &iph->saddr;
 286        }
 287
 288
 289        /* Mask saddr with the netmask to adjust template granularity */
 290#ifdef CONFIG_IP_VS_IPV6
 291        if (svc->af == AF_INET6)
 292                ipv6_addr_prefix(&snet.in6, &src_addr->in6,
 293                                 (__force __u32) svc->netmask);
 294        else
 295#endif
 296                snet.ip = src_addr->ip & svc->netmask;
 297
 298        IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
 299                      "mnet %s\n",
 300                      IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port),
 301                      IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port),
 302                      IP_VS_DBG_ADDR(svc->af, &snet));
 303
 304        /*
 305         * As far as we know, FTP is a very complicated network protocol, and
 306         * it uses control connection and data connections. For active FTP,
 307         * FTP server initialize data connection to the client, its source port
 308         * is often 20. For passive FTP, FTP server tells the clients the port
 309         * that it passively listens to,  and the client issues the data
 310         * connection. In the tunneling or direct routing mode, the load
 311         * balancer is on the client-to-server half of connection, the port
 312         * number is unknown to the load balancer. So, a conn template like
 313         * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
 314         * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 315         * is created for other persistent services.
 316         */
 317        {
 318                int protocol = iph->protocol;
 319                const union nf_inet_addr *vaddr = dst_addr;
 320                __be16 vport = 0;
 321
 322                if (dst_port == svc->port) {
 323                        /* non-FTP template:
 324                         * <protocol, caddr, 0, vaddr, vport, daddr, dport>
 325                         * FTP template:
 326                         * <protocol, caddr, 0, vaddr, 0, daddr, 0>
 327                         */
 328                        if (svc->port != FTPPORT)
 329                                vport = dst_port;
 330                } else {
 331                        /* Note: persistent fwmark-based services and
 332                         * persistent port zero service are handled here.
 333                         * fwmark template:
 334                         * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
 335                         * port zero template:
 336                         * <protocol,caddr,0,vaddr,0,daddr,0>
 337                         */
 338                        if (svc->fwmark) {
 339                                protocol = IPPROTO_IP;
 340                                vaddr = &fwmark;
 341                        }
 342                }
 343                /* return *ignored = -1 so NF_DROP can be used */
 344                if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
 345                                                  vaddr, vport, &param) < 0) {
 346                        *ignored = -1;
 347                        return NULL;
 348                }
 349        }
 350
 351        /* Check if a template already exists */
 352        ct = ip_vs_ct_in_get(&param);
 353        if (!ct || !ip_vs_check_template(ct, NULL)) {
 354                struct ip_vs_scheduler *sched;
 355
 356                /*
 357                 * No template found or the dest of the connection
 358                 * template is not available.
 359                 * return *ignored=0 i.e. ICMP and NF_DROP
 360                 */
 361                sched = rcu_dereference(svc->scheduler);
 362                if (sched) {
 363                        /* read svc->sched_data after svc->scheduler */
 364                        smp_rmb();
 365                        dest = sched->schedule(svc, skb, iph);
 366                } else {
 367                        dest = NULL;
 368                }
 369                if (!dest) {
 370                        IP_VS_DBG(1, "p-schedule: no dest found.\n");
 371                        kfree(param.pe_data);
 372                        *ignored = 0;
 373                        return NULL;
 374                }
 375
 376                if (dst_port == svc->port && svc->port != FTPPORT)
 377                        dport = dest->port;
 378
 379                /* Create a template
 380                 * This adds param.pe_data to the template,
 381                 * and thus param.pe_data will be destroyed
 382                 * when the template expires */
 383                ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport,
 384                                    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
 385                if (ct == NULL) {
 386                        kfree(param.pe_data);
 387                        *ignored = -1;
 388                        return NULL;
 389                }
 390
 391                ct->timeout = svc->timeout;
 392        } else {
 393                /* set destination with the found template */
 394                dest = ct->dest;
 395                kfree(param.pe_data);
 396        }
 397
 398        dport = dst_port;
 399        if (dport == svc->port && dest->port)
 400                dport = dest->port;
 401
 402        flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 403                 && iph->protocol == IPPROTO_UDP) ?
 404                IP_VS_CONN_F_ONE_PACKET : 0;
 405
 406        /*
 407         *    Create a new connection according to the template
 408         */
 409        ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr,
 410                              src_port, dst_addr, dst_port, &param);
 411
 412        cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest,
 413                            skb->mark);
 414        if (cp == NULL) {
 415                ip_vs_conn_put(ct);
 416                *ignored = -1;
 417                return NULL;
 418        }
 419
 420        /*
 421         *    Add its control
 422         */
 423        ip_vs_control_add(cp, ct);
 424        ip_vs_conn_put(ct);
 425
 426        ip_vs_conn_stats(cp, svc);
 427        return cp;
 428}
 429
 430
 431/*
 432 *  IPVS main scheduling function
 433 *  It selects a server according to the virtual service, and
 434 *  creates a connection entry.
 435 *  Protocols supported: TCP, UDP
 436 *
 437 *  Usage of *ignored
 438 *
 439 * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
 440 *       svc/scheduler decides that this packet should be accepted with
 441 *       NF_ACCEPT because it must not be scheduled.
 442 *
 443 * 0 :   scheduler can not find destination, so try bypass or
 444 *       return ICMP and then NF_DROP (ip_vs_leave).
 445 *
 446 * -1 :  scheduler tried to schedule but fatal error occurred, eg.
 447 *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
 448 *       failure such as missing Call-ID, ENOMEM on skb_linearize
 449 *       or pe_data. In this case we should return NF_DROP without
 450 *       any attempts to send ICMP with ip_vs_leave.
 451 */
 452struct ip_vs_conn *
 453ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 454               struct ip_vs_proto_data *pd, int *ignored,
 455               struct ip_vs_iphdr *iph)
 456{
 457        struct ip_vs_protocol *pp = pd->pp;
 458        struct ip_vs_conn *cp = NULL;
 459        struct ip_vs_scheduler *sched;
 460        struct ip_vs_dest *dest;
 461        __be16 _ports[2], *pptr, cport, vport;
 462        const void *caddr, *vaddr;
 463        unsigned int flags;
 464
 465        *ignored = 1;
 466        /*
 467         * IPv6 frags, only the first hit here.
 468         */
 469        pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 470        if (pptr == NULL)
 471                return NULL;
 472
 473        if (likely(!ip_vs_iph_inverse(iph))) {
 474                cport = pptr[0];
 475                caddr = &iph->saddr;
 476                vport = pptr[1];
 477                vaddr = &iph->daddr;
 478        } else {
 479                cport = pptr[1];
 480                caddr = &iph->daddr;
 481                vport = pptr[0];
 482                vaddr = &iph->saddr;
 483        }
 484
 485        /*
 486         * FTPDATA needs this check when using local real server.
 487         * Never schedule Active FTPDATA connections from real server.
 488         * For LVS-NAT they must be already created. For other methods
 489         * with persistence the connection is created on SYN+ACK.
 490         */
 491        if (cport == FTPDATA) {
 492                IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
 493                              "Not scheduling FTPDATA");
 494                return NULL;
 495        }
 496
 497        /*
 498         *    Do not schedule replies from local real server.
 499         */
 500        if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) {
 501                iph->hdr_flags ^= IP_VS_HDR_INVERSE;
 502                cp = INDIRECT_CALL_1(pp->conn_in_get,
 503                                     ip_vs_conn_in_get_proto, svc->ipvs,
 504                                     svc->af, skb, iph);
 505                iph->hdr_flags ^= IP_VS_HDR_INVERSE;
 506
 507                if (cp) {
 508                        IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
 509                                      "Not scheduling reply for existing"
 510                                      " connection");
 511                        __ip_vs_conn_put(cp);
 512                        return NULL;
 513                }
 514        }
 515
 516        /*
 517         *    Persistent service
 518         */
 519        if (svc->flags & IP_VS_SVC_F_PERSISTENT)
 520                return ip_vs_sched_persist(svc, skb, cport, vport, ignored,
 521                                           iph);
 522
 523        *ignored = 0;
 524
 525        /*
 526         *    Non-persistent service
 527         */
 528        if (!svc->fwmark && vport != svc->port) {
 529                if (!svc->port)
 530                        pr_err("Schedule: port zero only supported "
 531                               "in persistent services, "
 532                               "check your ipvs configuration\n");
 533                return NULL;
 534        }
 535
 536        sched = rcu_dereference(svc->scheduler);
 537        if (sched) {
 538                /* read svc->sched_data after svc->scheduler */
 539                smp_rmb();
 540                dest = sched->schedule(svc, skb, iph);
 541        } else {
 542                dest = NULL;
 543        }
 544        if (dest == NULL) {
 545                IP_VS_DBG(1, "Schedule: no dest found.\n");
 546                return NULL;
 547        }
 548
 549        flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 550                 && iph->protocol == IPPROTO_UDP) ?
 551                IP_VS_CONN_F_ONE_PACKET : 0;
 552
 553        /*
 554         *    Create a connection entry.
 555         */
 556        {
 557                struct ip_vs_conn_param p;
 558
 559                ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
 560                                      caddr, cport, vaddr, vport, &p);
 561                cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
 562                                    dest->port ? dest->port : vport,
 563                                    flags, dest, skb->mark);
 564                if (!cp) {
 565                        *ignored = -1;
 566                        return NULL;
 567                }
 568        }
 569
 570        IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 571                      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
 572                      ip_vs_fwd_tag(cp),
 573                      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
 574                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
 575                      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
 576                      cp->flags, refcount_read(&cp->refcnt));
 577
 578        ip_vs_conn_stats(cp, svc);
 579        return cp;
 580}
 581
 582static inline int ip_vs_addr_is_unicast(struct net *net, int af,
 583                                        union nf_inet_addr *addr)
 584{
 585#ifdef CONFIG_IP_VS_IPV6
 586        if (af == AF_INET6)
 587                return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST;
 588#endif
 589        return (inet_addr_type(net, addr->ip) == RTN_UNICAST);
 590}
 591
 592/*
 593 *  Pass or drop the packet.
 594 *  Called by ip_vs_in, when the virtual service is available but
 595 *  no destination is available for a new connection.
 596 */
 597int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 598                struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
 599{
 600        __be16 _ports[2], *pptr, dport;
 601        struct netns_ipvs *ipvs = svc->ipvs;
 602        struct net *net = ipvs->net;
 603
 604        pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 605        if (!pptr)
 606                return NF_DROP;
 607        dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0];
 608
 609        /* if it is fwmark-based service, the cache_bypass sysctl is up
 610           and the destination is a non-local unicast, then create
 611           a cache_bypass connection entry */
 612        if (sysctl_cache_bypass(ipvs) && svc->fwmark &&
 613            !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) &&
 614            ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) {
 615                int ret;
 616                struct ip_vs_conn *cp;
 617                unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
 618                                      iph->protocol == IPPROTO_UDP) ?
 619                                      IP_VS_CONN_F_ONE_PACKET : 0;
 620                union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
 621
 622                /* create a new connection entry */
 623                IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
 624                {
 625                        struct ip_vs_conn_param p;
 626                        ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
 627                                              &iph->saddr, pptr[0],
 628                                              &iph->daddr, pptr[1], &p);
 629                        cp = ip_vs_conn_new(&p, svc->af, &daddr, 0,
 630                                            IP_VS_CONN_F_BYPASS | flags,
 631                                            NULL, skb->mark);
 632                        if (!cp)
 633                                return NF_DROP;
 634                }
 635
 636                /* statistics */
 637                ip_vs_in_stats(cp, skb);
 638
 639                /* set state */
 640                ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
 641
 642                /* transmit the first SYN packet */
 643                ret = cp->packet_xmit(skb, cp, pd->pp, iph);
 644                /* do not touch skb anymore */
 645
 646                if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
 647                        atomic_inc(&cp->control->in_pkts);
 648                else
 649                        atomic_inc(&cp->in_pkts);
 650                ip_vs_conn_put(cp);
 651                return ret;
 652        }
 653
 654        /*
 655         * When the virtual ftp service is presented, packets destined
 656         * for other services on the VIP may get here (except services
 657         * listed in the ipvs table), pass the packets, because it is
 658         * not ipvs job to decide to drop the packets.
 659         */
 660        if (svc->port == FTPPORT && dport != FTPPORT)
 661                return NF_ACCEPT;
 662
 663        if (unlikely(ip_vs_iph_icmp(iph)))
 664                return NF_DROP;
 665
 666        /*
 667         * Notify the client that the destination is unreachable, and
 668         * release the socket buffer.
 669         * Since it is in IP layer, the TCP socket is not actually
 670         * created, the TCP RST packet cannot be sent, instead that
 671         * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 672         */
 673#ifdef CONFIG_IP_VS_IPV6
 674        if (svc->af == AF_INET6) {
 675                if (!skb->dev)
 676                        skb->dev = net->loopback_dev;
 677                icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
 678        } else
 679#endif
 680                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 681
 682        return NF_DROP;
 683}
 684
 685#ifdef CONFIG_SYSCTL
 686
 687static int sysctl_snat_reroute(struct netns_ipvs *ipvs)
 688{
 689        return ipvs->sysctl_snat_reroute;
 690}
 691
 692static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
 693{
 694        return ipvs->sysctl_nat_icmp_send;
 695}
 696
 697#else
 698
 699static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
 700static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
 701
 702#endif
 703
 704__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 705{
 706        return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 707}
 708
 709static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
 710{
 711        if (NF_INET_LOCAL_IN == hooknum)
 712                return IP_DEFRAG_VS_IN;
 713        if (NF_INET_FORWARD == hooknum)
 714                return IP_DEFRAG_VS_FWD;
 715        return IP_DEFRAG_VS_OUT;
 716}
 717
 718static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs,
 719                                     struct sk_buff *skb, u_int32_t user)
 720{
 721        int err;
 722
 723        local_bh_disable();
 724        err = ip_defrag(ipvs->net, skb, user);
 725        local_bh_enable();
 726        if (!err)
 727                ip_send_check(ip_hdr(skb));
 728
 729        return err;
 730}
 731
 732static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
 733                                 struct sk_buff *skb, unsigned int hooknum)
 734{
 735        if (!sysctl_snat_reroute(ipvs))
 736                return 0;
 737        /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */
 738        if (NF_INET_LOCAL_IN == hooknum)
 739                return 0;
 740#ifdef CONFIG_IP_VS_IPV6
 741        if (af == AF_INET6) {
 742                struct dst_entry *dst = skb_dst(skb);
 743
 744                if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
 745                    ip6_route_me_harder(ipvs->net, skb) != 0)
 746                        return 1;
 747        } else
 748#endif
 749                if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
 750                    ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
 751                        return 1;
 752
 753        return 0;
 754}
 755
 756/*
 757 * Packet has been made sufficiently writable in caller
 758 * - inout: 1=in->out, 0=out->in
 759 */
 760void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 761                    struct ip_vs_conn *cp, int inout)
 762{
 763        struct iphdr *iph        = ip_hdr(skb);
 764        unsigned int icmp_offset = iph->ihl*4;
 765        struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
 766                                                      icmp_offset);
 767        struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
 768
 769        if (inout) {
 770                iph->saddr = cp->vaddr.ip;
 771                ip_send_check(iph);
 772                ciph->daddr = cp->vaddr.ip;
 773                ip_send_check(ciph);
 774        } else {
 775                iph->daddr = cp->daddr.ip;
 776                ip_send_check(iph);
 777                ciph->saddr = cp->daddr.ip;
 778                ip_send_check(ciph);
 779        }
 780
 781        /* the TCP/UDP/SCTP port */
 782        if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
 783            IPPROTO_SCTP == ciph->protocol) {
 784                __be16 *ports = (void *)ciph + ciph->ihl*4;
 785
 786                if (inout)
 787                        ports[1] = cp->vport;
 788                else
 789                        ports[0] = cp->dport;
 790        }
 791
 792        /* And finally the ICMP checksum */
 793        icmph->checksum = 0;
 794        icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
 795        skb->ip_summed = CHECKSUM_UNNECESSARY;
 796
 797        if (inout)
 798                IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 799                        "Forwarding altered outgoing ICMP");
 800        else
 801                IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 802                        "Forwarding altered incoming ICMP");
 803}
 804
 805#ifdef CONFIG_IP_VS_IPV6
 806void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 807                    struct ip_vs_conn *cp, int inout)
 808{
 809        struct ipv6hdr *iph      = ipv6_hdr(skb);
 810        unsigned int icmp_offset = 0;
 811        unsigned int offs        = 0; /* header offset*/
 812        int protocol;
 813        struct icmp6hdr *icmph;
 814        struct ipv6hdr *ciph;
 815        unsigned short fragoffs;
 816
 817        ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
 818        icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
 819        offs = icmp_offset + sizeof(struct icmp6hdr);
 820        ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);
 821
 822        protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);
 823
 824        if (inout) {
 825                iph->saddr = cp->vaddr.in6;
 826                ciph->daddr = cp->vaddr.in6;
 827        } else {
 828                iph->daddr = cp->daddr.in6;
 829                ciph->saddr = cp->daddr.in6;
 830        }
 831
 832        /* the TCP/UDP/SCTP port */
 833        if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
 834                          IPPROTO_SCTP == protocol)) {
 835                __be16 *ports = (void *)(skb_network_header(skb) + offs);
 836
 837                IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
 838                              ntohs(inout ? ports[1] : ports[0]),
 839                              ntohs(inout ? cp->vport : cp->dport));
 840                if (inout)
 841                        ports[1] = cp->vport;
 842                else
 843                        ports[0] = cp->dport;
 844        }
 845
 846        /* And finally the ICMP checksum */
 847        icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
 848                                              skb->len - icmp_offset,
 849                                              IPPROTO_ICMPV6, 0);
 850        skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
 851        skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
 852        skb->ip_summed = CHECKSUM_PARTIAL;
 853
 854        if (inout)
 855                IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
 856                              (void *)ciph - (void *)iph,
 857                              "Forwarding altered outgoing ICMPv6");
 858        else
 859                IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
 860                              (void *)ciph - (void *)iph,
 861                              "Forwarding altered incoming ICMPv6");
 862}
 863#endif
 864
 865/* Handle relevant response ICMP messages - forward to the right
 866 * destination host.
 867 */
 868static int handle_response_icmp(int af, struct sk_buff *skb,
 869                                union nf_inet_addr *snet,
 870                                __u8 protocol, struct ip_vs_conn *cp,
 871                                struct ip_vs_protocol *pp,
 872                                unsigned int offset, unsigned int ihl,
 873                                unsigned int hooknum)
 874{
 875        unsigned int verdict = NF_DROP;
 876
 877        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
 878                goto ignore_cp;
 879
 880        /* Ensure the checksum is correct */
 881        if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
 882                /* Failed checksum! */
 883                IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
 884                              IP_VS_DBG_ADDR(af, snet));
 885                goto out;
 886        }
 887
 888        if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
 889            IPPROTO_SCTP == protocol)
 890                offset += 2 * sizeof(__u16);
 891        if (skb_ensure_writable(skb, offset))
 892                goto out;
 893
 894#ifdef CONFIG_IP_VS_IPV6
 895        if (af == AF_INET6)
 896                ip_vs_nat_icmp_v6(skb, pp, cp, 1);
 897        else
 898#endif
 899                ip_vs_nat_icmp(skb, pp, cp, 1);
 900
 901        if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
 902                goto out;
 903
 904        /* do the statistics and put it back */
 905        ip_vs_out_stats(cp, skb);
 906
 907        skb->ipvs_property = 1;
 908        if (!(cp->flags & IP_VS_CONN_F_NFCT))
 909                ip_vs_notrack(skb);
 910        else
 911                ip_vs_update_conntrack(skb, cp, 0);
 912
 913ignore_cp:
 914        verdict = NF_ACCEPT;
 915
 916out:
 917        __ip_vs_conn_put(cp);
 918
 919        return verdict;
 920}
 921
 922/*
 923 *      Handle ICMP messages in the inside-to-outside direction (outgoing).
 924 *      Find any that might be relevant, check against existing connections.
 925 *      Currently handles error types - unreachable, quench, ttl exceeded.
 926 */
 927static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb,
 928                          int *related, unsigned int hooknum)
 929{
 930        struct iphdr *iph;
 931        struct icmphdr  _icmph, *ic;
 932        struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
 933        struct ip_vs_iphdr ciph;
 934        struct ip_vs_conn *cp;
 935        struct ip_vs_protocol *pp;
 936        unsigned int offset, ihl;
 937        union nf_inet_addr snet;
 938
 939        *related = 1;
 940
 941        /* reassemble IP fragments */
 942        if (ip_is_fragment(ip_hdr(skb))) {
 943                if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
 944                        return NF_STOLEN;
 945        }
 946
 947        iph = ip_hdr(skb);
 948        offset = ihl = iph->ihl * 4;
 949        ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 950        if (ic == NULL)
 951                return NF_DROP;
 952
 953        IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
 954                  ic->type, ntohs(icmp_id(ic)),
 955                  &iph->saddr, &iph->daddr);
 956
 957        /*
 958         * Work through seeing if this is for us.
 959         * These checks are supposed to be in an order that means easy
 960         * things are checked first to speed up processing.... however
 961         * this means that some packets will manage to get a long way
 962         * down this stack and then be rejected, but that's life.
 963         */
 964        if ((ic->type != ICMP_DEST_UNREACH) &&
 965            (ic->type != ICMP_SOURCE_QUENCH) &&
 966            (ic->type != ICMP_TIME_EXCEEDED)) {
 967                *related = 0;
 968                return NF_ACCEPT;
 969        }
 970
 971        /* Now find the contained IP header */
 972        offset += sizeof(_icmph);
 973        cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 974        if (cih == NULL)
 975                return NF_ACCEPT; /* The packet looks wrong, ignore */
 976
 977        pp = ip_vs_proto_get(cih->protocol);
 978        if (!pp)
 979                return NF_ACCEPT;
 980
 981        /* Is the embedded protocol header present? */
 982        if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
 983                     pp->dont_defrag))
 984                return NF_ACCEPT;
 985
 986        IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
 987                      "Checking outgoing ICMP for");
 988
 989        ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph);
 990
 991        /* The embedded headers contain source and dest in reverse order */
 992        cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
 993                             ipvs, AF_INET, skb, &ciph);
 994        if (!cp)
 995                return NF_ACCEPT;
 996
 997        snet.ip = iph->saddr;
 998        return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
 999                                    pp, ciph.len, ihl, hooknum);
1000}
1001
1002#ifdef CONFIG_IP_VS_IPV6
1003static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
1004                             int *related,  unsigned int hooknum,
1005                             struct ip_vs_iphdr *ipvsh)
1006{
1007        struct icmp6hdr _icmph, *ic;
1008        struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
1009        struct ip_vs_conn *cp;
1010        struct ip_vs_protocol *pp;
1011        union nf_inet_addr snet;
1012        unsigned int offset;
1013
1014        *related = 1;
1015        ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph);
1016        if (ic == NULL)
1017                return NF_DROP;
1018
1019        /*
1020         * Work through seeing if this is for us.
1021         * These checks are supposed to be in an order that means easy
1022         * things are checked first to speed up processing.... however
1023         * this means that some packets will manage to get a long way
1024         * down this stack and then be rejected, but that's life.
1025         */
1026        if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
1027                *related = 0;
1028                return NF_ACCEPT;
1029        }
1030        /* Fragment header that is before ICMP header tells us that:
1031         * it's not an error message since they can't be fragmented.
1032         */
1033        if (ipvsh->flags & IP6_FH_F_FRAG)
1034                return NF_DROP;
1035
1036        IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
1037                  ic->icmp6_type, ntohs(icmpv6_id(ic)),
1038                  &ipvsh->saddr, &ipvsh->daddr);
1039
1040        if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph),
1041                                     true, &ciph))
1042                return NF_ACCEPT; /* The packet looks wrong, ignore */
1043
1044        pp = ip_vs_proto_get(ciph.protocol);
1045        if (!pp)
1046                return NF_ACCEPT;
1047
1048        /* The embedded headers contain source and dest in reverse order */
1049        cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
1050                             ipvs, AF_INET6, skb, &ciph);
1051        if (!cp)
1052                return NF_ACCEPT;
1053
1054        snet.in6 = ciph.saddr.in6;
1055        offset = ciph.len;
1056        return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
1057                                    pp, offset, sizeof(struct ipv6hdr),
1058                                    hooknum);
1059}
1060#endif
1061
1062/*
1063 * Check if sctp chunc is ABORT chunk
1064 */
1065static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
1066{
1067        struct sctp_chunkhdr *sch, schunk;
1068        sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr),
1069                                 sizeof(schunk), &schunk);
1070        if (sch == NULL)
1071                return 0;
1072        if (sch->type == SCTP_CID_ABORT)
1073                return 1;
1074        return 0;
1075}
1076
1077static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
1078{
1079        struct tcphdr _tcph, *th;
1080
1081        th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
1082        if (th == NULL)
1083                return 0;
1084        return th->rst;
1085}
1086
1087static inline bool is_new_conn(const struct sk_buff *skb,
1088                               struct ip_vs_iphdr *iph)
1089{
1090        switch (iph->protocol) {
1091        case IPPROTO_TCP: {
1092                struct tcphdr _tcph, *th;
1093
1094                th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
1095                if (th == NULL)
1096                        return false;
1097                return th->syn;
1098        }
1099        case IPPROTO_SCTP: {
1100                struct sctp_chunkhdr *sch, schunk;
1101
1102                sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr),
1103                                         sizeof(schunk), &schunk);
1104                if (sch == NULL)
1105                        return false;
1106                return sch->type == SCTP_CID_INIT;
1107        }
1108        default:
1109                return false;
1110        }
1111}
1112
1113static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
1114                                        int conn_reuse_mode)
1115{
1116        /* Controlled (FTP DATA or persistence)? */
1117        if (cp->control)
1118                return false;
1119
1120        switch (cp->protocol) {
1121        case IPPROTO_TCP:
1122                return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
1123                       (cp->state == IP_VS_TCP_S_CLOSE) ||
1124                        ((conn_reuse_mode & 2) &&
1125                         (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
1126                         (cp->flags & IP_VS_CONN_F_NOOUTPUT));
1127        case IPPROTO_SCTP:
1128                return cp->state == IP_VS_SCTP_S_CLOSED;
1129        default:
1130                return false;
1131        }
1132}
1133
1134/* Generic function to create new connections for outgoing RS packets
1135 *
1136 * Pre-requisites for successful connection creation:
1137 * 1) Virtual Service is NOT fwmark based:
1138 *    In fwmark-VS actual vaddr and vport are unknown to IPVS
1139 * 2) Real Server and Virtual Service were NOT configured without port:
1140 *    This is to allow match of different VS to the same RS ip-addr
1141 */
1142struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
1143                                      struct ip_vs_dest *dest,
1144                                      struct sk_buff *skb,
1145                                      const struct ip_vs_iphdr *iph,
1146                                      __be16 dport,
1147                                      __be16 cport)
1148{
1149        struct ip_vs_conn_param param;
1150        struct ip_vs_conn *ct = NULL, *cp = NULL;
1151        const union nf_inet_addr *vaddr, *daddr, *caddr;
1152        union nf_inet_addr snet;
1153        __be16 vport;
1154        unsigned int flags;
1155
1156        EnterFunction(12);
1157        vaddr = &svc->addr;
1158        vport = svc->port;
1159        daddr = &iph->saddr;
1160        caddr = &iph->daddr;
1161
1162        /* check pre-requisites are satisfied */
1163        if (svc->fwmark)
1164                return NULL;
1165        if (!vport || !dport)
1166                return NULL;
1167
1168        /* for persistent service first create connection template */
1169        if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
1170                /* apply netmask the same way ingress-side does */
1171#ifdef CONFIG_IP_VS_IPV6
1172                if (svc->af == AF_INET6)
1173                        ipv6_addr_prefix(&snet.in6, &caddr->in6,
1174                                         (__force __u32)svc->netmask);
1175                else
1176#endif
1177                        snet.ip = caddr->ip & svc->netmask;
1178                /* fill params and create template if not existent */
1179                if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol,
1180                                                  &snet, 0, vaddr,
1181                                                  vport, &param) < 0)
1182                        return NULL;
1183                ct = ip_vs_ct_in_get(&param);
1184                /* check if template exists and points to the same dest */
1185                if (!ct || !ip_vs_check_template(ct, dest)) {
1186                        ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
1187                                            IP_VS_CONN_F_TEMPLATE, dest, 0);
1188                        if (!ct) {
1189                                kfree(param.pe_data);
1190                                return NULL;
1191                        }
1192                        ct->timeout = svc->timeout;
1193                } else {
1194                        kfree(param.pe_data);
1195                }
1196        }
1197
1198        /* connection flags */
1199        flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) &&
1200                 iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0;
1201        /* create connection */
1202        ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
1203                              caddr, cport, vaddr, vport, &param);
1204        cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0);
1205        if (!cp) {
1206                if (ct)
1207                        ip_vs_conn_put(ct);
1208                return NULL;
1209        }
1210        if (ct) {
1211                ip_vs_control_add(cp, ct);
1212                ip_vs_conn_put(ct);
1213        }
1214        ip_vs_conn_stats(cp, svc);
1215
1216        /* return connection (will be used to handle outgoing packet) */
1217        IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u "
1218                      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
1219                      ip_vs_fwd_tag(cp),
1220                      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
1221                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
1222                      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
1223                      cp->flags, refcount_read(&cp->refcnt));
1224        LeaveFunction(12);
1225        return cp;
1226}
1227
1228/* Handle outgoing packets which are considered requests initiated by
1229 * real servers, so that subsequent responses from external client can be
1230 * routed to the right real server.
1231 * Used also for outgoing responses in OPS mode.
1232 *
1233 * Connection management is handled by persistent-engine specific callback.
1234 */
1235static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum,
1236                                              struct netns_ipvs *ipvs,
1237                                              int af, struct sk_buff *skb,
1238                                              const struct ip_vs_iphdr *iph)
1239{
1240        struct ip_vs_dest *dest;
1241        struct ip_vs_conn *cp = NULL;
1242        __be16 _ports[2], *pptr;
1243
1244        if (hooknum == NF_INET_LOCAL_IN)
1245                return NULL;
1246
1247        pptr = frag_safe_skb_hp(skb, iph->len,
1248                                sizeof(_ports), _ports);
1249        if (!pptr)
1250                return NULL;
1251
1252        dest = ip_vs_find_real_service(ipvs, af, iph->protocol,
1253                                       &iph->saddr, pptr[0]);
1254        if (dest) {
1255                struct ip_vs_service *svc;
1256                struct ip_vs_pe *pe;
1257
1258                svc = rcu_dereference(dest->svc);
1259                if (svc) {
1260                        pe = rcu_dereference(svc->pe);
1261                        if (pe && pe->conn_out)
1262                                cp = pe->conn_out(svc, dest, skb, iph,
1263                                                  pptr[0], pptr[1]);
1264                }
1265        }
1266
1267        return cp;
1268}
1269
1270/* Handle response packets: rewrite addresses and send away...
1271 */
1272static unsigned int
1273handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
1274                struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
1275                unsigned int hooknum)
1276{
1277        struct ip_vs_protocol *pp = pd->pp;
1278
1279        IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
1280
1281        if (skb_ensure_writable(skb, iph->len))
1282                goto drop;
1283
1284        /* mangle the packet */
1285        if (pp->snat_handler &&
1286            !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))
1287                goto drop;
1288
1289#ifdef CONFIG_IP_VS_IPV6
1290        if (af == AF_INET6)
1291                ipv6_hdr(skb)->saddr = cp->vaddr.in6;
1292        else
1293#endif
1294        {
1295                ip_hdr(skb)->saddr = cp->vaddr.ip;
1296                ip_send_check(ip_hdr(skb));
1297        }
1298
1299        /*
1300         * nf_iterate does not expect change in the skb->dst->dev.
1301         * It looks like it is not fatal to enable this code for hooks
1302         * where our handlers are at the end of the chain list and
1303         * when all next handlers use skb->dst->dev and not outdev.
1304         * It will definitely route properly the inout NAT traffic
1305         * when multiple paths are used.
1306         */
1307
1308        /* For policy routing, packets originating from this
1309         * machine itself may be routed differently to packets
1310         * passing through.  We want this packet to be routed as
1311         * if it came from this machine itself.  So re-compute
1312         * the routing information.
1313         */
1314        if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
1315                goto drop;
1316
1317        IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
1318
1319        ip_vs_out_stats(cp, skb);
1320        ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
1321        skb->ipvs_property = 1;
1322        if (!(cp->flags & IP_VS_CONN_F_NFCT))
1323                ip_vs_notrack(skb);
1324        else
1325                ip_vs_update_conntrack(skb, cp, 0);
1326        ip_vs_conn_put(cp);
1327
1328        LeaveFunction(11);
1329        return NF_ACCEPT;
1330
1331drop:
1332        ip_vs_conn_put(cp);
1333        kfree_skb(skb);
1334        LeaveFunction(11);
1335        return NF_STOLEN;
1336}
1337
1338/*
1339 *      Check if outgoing packet belongs to the established ip_vs_conn.
1340 */
1341static unsigned int
1342ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
1343{
1344        struct ip_vs_iphdr iph;
1345        struct ip_vs_protocol *pp;
1346        struct ip_vs_proto_data *pd;
1347        struct ip_vs_conn *cp;
1348        struct sock *sk;
1349
1350        EnterFunction(11);
1351
1352        /* Already marked as IPVS request or reply? */
1353        if (skb->ipvs_property)
1354                return NF_ACCEPT;
1355
1356        sk = skb_to_full_sk(skb);
1357        /* Bad... Do not break raw sockets */
1358        if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
1359                     af == AF_INET)) {
1360
1361                if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
1362                        return NF_ACCEPT;
1363        }
1364
1365        if (unlikely(!skb_dst(skb)))
1366                return NF_ACCEPT;
1367
1368        if (!ipvs->enable)
1369                return NF_ACCEPT;
1370
1371        ip_vs_fill_iph_skb(af, skb, false, &iph);
1372#ifdef CONFIG_IP_VS_IPV6
1373        if (af == AF_INET6) {
1374                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1375                        int related;
1376                        int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,
1377                                                        hooknum, &iph);
1378
1379                        if (related)
1380                                return verdict;
1381                }
1382        } else
1383#endif
1384                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1385                        int related;
1386                        int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);
1387
1388                        if (related)
1389                                return verdict;
1390                }
1391
1392        pd = ip_vs_proto_data_get(ipvs, iph.protocol);
1393        if (unlikely(!pd))
1394                return NF_ACCEPT;
1395        pp = pd->pp;
1396
1397        /* reassemble IP fragments */
1398#ifdef CONFIG_IP_VS_IPV6
1399        if (af == AF_INET)
1400#endif
1401                if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
1402                        if (ip_vs_gather_frags(ipvs, skb,
1403                                               ip_vs_defrag_user(hooknum)))
1404                                return NF_STOLEN;
1405
1406                        ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);
1407                }
1408
1409        /*
1410         * Check if the packet belongs to an existing entry
1411         */
1412        cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
1413                             ipvs, af, skb, &iph);
1414
1415        if (likely(cp)) {
1416                if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
1417                        goto ignore_cp;
1418                return handle_response(af, skb, pd, cp, &iph, hooknum);
1419        }
1420
1421        /* Check for real-server-started requests */
1422        if (atomic_read(&ipvs->conn_out_counter)) {
1423                /* Currently only for UDP:
1424                 * connection oriented protocols typically use
1425                 * ephemeral ports for outgoing connections, so
1426                 * related incoming responses would not match any VS
1427                 */
1428                if (pp->protocol == IPPROTO_UDP) {
1429                        cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
1430                        if (likely(cp))
1431                                return handle_response(af, skb, pd, cp, &iph,
1432                                                       hooknum);
1433                }
1434        }
1435
1436        if (sysctl_nat_icmp_send(ipvs) &&
1437            (pp->protocol == IPPROTO_TCP ||
1438             pp->protocol == IPPROTO_UDP ||
1439             pp->protocol == IPPROTO_SCTP)) {
1440                __be16 _ports[2], *pptr;
1441
1442                pptr = frag_safe_skb_hp(skb, iph.len,
1443                                         sizeof(_ports), _ports);
1444                if (pptr == NULL)
1445                        return NF_ACCEPT;       /* Not for me */
1446                if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
1447                                           pptr[0])) {
1448                        /*
1449                         * Notify the real server: there is no
1450                         * existing entry if it is not RST
1451                         * packet or not TCP packet.
1452                         */
1453                        if ((iph.protocol != IPPROTO_TCP &&
1454                             iph.protocol != IPPROTO_SCTP)
1455                             || ((iph.protocol == IPPROTO_TCP
1456                                  && !is_tcp_reset(skb, iph.len))
1457                                 || (iph.protocol == IPPROTO_SCTP
1458                                        && !is_sctp_abort(skb,
1459                                                iph.len)))) {
1460#ifdef CONFIG_IP_VS_IPV6
1461                                if (af == AF_INET6) {
1462                                        if (!skb->dev)
1463                                                skb->dev = ipvs->net->loopback_dev;
1464                                        icmpv6_send(skb,
1465                                                    ICMPV6_DEST_UNREACH,
1466                                                    ICMPV6_PORT_UNREACH,
1467                                                    0);
1468                                } else
1469#endif
1470                                        icmp_send(skb,
1471                                                  ICMP_DEST_UNREACH,
1472                                                  ICMP_PORT_UNREACH, 0);
1473                                return NF_DROP;
1474                        }
1475                }
1476        }
1477
1478out:
1479        IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
1480                      "ip_vs_out: packet continues traversal as normal");
1481        return NF_ACCEPT;
1482
1483ignore_cp:
1484        __ip_vs_conn_put(cp);
1485        goto out;
1486}
1487
1488/*
1489 *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1490 *      used only for VS/NAT.
1491 *      Check if packet is reply for established ip_vs_conn.
1492 */
1493static unsigned int
1494ip_vs_reply4(void *priv, struct sk_buff *skb,
1495             const struct nf_hook_state *state)
1496{
1497        return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
1498}
1499
1500/*
1501 *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1502 *      Check if packet is reply for established ip_vs_conn.
1503 */
1504static unsigned int
1505ip_vs_local_reply4(void *priv, struct sk_buff *skb,
1506                   const struct nf_hook_state *state)
1507{
1508        return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
1509}
1510
1511#ifdef CONFIG_IP_VS_IPV6
1512
1513/*
1514 *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1515 *      used only for VS/NAT.
1516 *      Check if packet is reply for established ip_vs_conn.
1517 */
1518static unsigned int
1519ip_vs_reply6(void *priv, struct sk_buff *skb,
1520             const struct nf_hook_state *state)
1521{
1522        return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
1523}
1524
1525/*
1526 *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1527 *      Check if packet is reply for established ip_vs_conn.
1528 */
1529static unsigned int
1530ip_vs_local_reply6(void *priv, struct sk_buff *skb,
1531                   const struct nf_hook_state *state)
1532{
1533        return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
1534}
1535
1536#endif
1537
1538static unsigned int
1539ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
1540                      struct ip_vs_proto_data *pd,
1541                      int *verdict, struct ip_vs_conn **cpp,
1542                      struct ip_vs_iphdr *iph)
1543{
1544        struct ip_vs_protocol *pp = pd->pp;
1545
1546        if (!iph->fragoffs) {
1547                /* No (second) fragments need to enter here, as nf_defrag_ipv6
1548                 * replayed fragment zero will already have created the cp
1549                 */
1550
1551                /* Schedule and create new connection entry into cpp */
1552                if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph))
1553                        return 0;
1554        }
1555
1556        if (unlikely(!*cpp)) {
1557                /* sorry, all this trouble for a no-hit :) */
1558                IP_VS_DBG_PKT(12, af, pp, skb, iph->off,
1559                              "ip_vs_in: packet continues traversal as normal");
1560
1561                /* Fragment couldn't be mapped to a conn entry */
1562                if (iph->fragoffs)
1563                        IP_VS_DBG_PKT(7, af, pp, skb, iph->off,
1564                                      "unhandled fragment");
1565
1566                *verdict = NF_ACCEPT;
1567                return 0;
1568        }
1569
1570        return 1;
1571}
1572
1573/* Check the UDP tunnel and return its header length */
1574static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
1575                          unsigned int offset, __u16 af,
1576                          const union nf_inet_addr *daddr, __u8 *proto)
1577{
1578        struct udphdr _udph, *udph;
1579        struct ip_vs_dest *dest;
1580
1581        udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
1582        if (!udph)
1583                goto unk;
1584        offset += sizeof(struct udphdr);
1585        dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest);
1586        if (!dest)
1587                goto unk;
1588        if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1589                struct guehdr _gueh, *gueh;
1590
1591                gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh);
1592                if (!gueh)
1593                        goto unk;
1594                if (gueh->control != 0 || gueh->version != 0)
1595                        goto unk;
1596                /* Later we can support also IPPROTO_IPV6 */
1597                if (gueh->proto_ctype != IPPROTO_IPIP)
1598                        goto unk;
1599                *proto = gueh->proto_ctype;
1600                return sizeof(struct udphdr) + sizeof(struct guehdr) +
1601                       (gueh->hlen << 2);
1602        }
1603
1604unk:
1605        return 0;
1606}
1607
1608/* Check the GRE tunnel and return its header length */
1609static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
1610                          unsigned int offset, __u16 af,
1611                          const union nf_inet_addr *daddr, __u8 *proto)
1612{
1613        struct gre_base_hdr _greh, *greh;
1614        struct ip_vs_dest *dest;
1615
1616        greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh);
1617        if (!greh)
1618                goto unk;
1619        dest = ip_vs_find_tunnel(ipvs, af, daddr, 0);
1620        if (!dest)
1621                goto unk;
1622        if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1623                __be16 type;
1624
1625                /* Only support version 0 and C (csum) */
1626                if ((greh->flags & ~GRE_CSUM) != 0)
1627                        goto unk;
1628                type = greh->protocol;
1629                /* Later we can support also IPPROTO_IPV6 */
1630                if (type != htons(ETH_P_IP))
1631                        goto unk;
1632                *proto = IPPROTO_IPIP;
1633                return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags));
1634        }
1635
1636unk:
1637        return 0;
1638}
1639
1640/*
1641 *      Handle ICMP messages in the outside-to-inside direction (incoming).
1642 *      Find any that might be relevant, check against existing connections,
1643 *      forward to the right destination host if relevant.
1644 *      Currently handles error types - unreachable, quench, ttl exceeded.
     *
     *      @ipvs:    IPVS state handed to the lookup and scheduling helpers
     *      @skb:     the ICMP packet; may be consumed here (see NF_STOLEN below)
     *      @related: set to 1 on entry, reset to 0 only when the ICMP type is
     *                not one of the handled error types
     *      @hooknum: netfilter hook number; selects the defrag user and is
     *                passed on to ip_vs_icmp_xmit()
     *
     *      Returns a netfilter verdict:
     *      - NF_STOLEN when ip_vs_gather_frags() returns non-zero, or after a
     *        tunnel error has been processed and the skb consumed;
     *      - NF_DROP when the ICMP header cannot be read or the checksum fails;
     *      - NF_ACCEPT when the packet is not handled here;
     *      - otherwise whatever ip_vs_try_to_schedule() or ip_vs_icmp_xmit()
     *        report.
1645 */
1646static int
1647ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
1648              unsigned int hooknum)
1649{
1650        struct iphdr *iph;
1651        struct icmphdr  _icmph, *ic;
1652        struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1653        struct ip_vs_iphdr ciph;
1654        struct ip_vs_conn *cp;
1655        struct ip_vs_protocol *pp;
1656        struct ip_vs_proto_data *pd;
1657        unsigned int offset, offset2, ihl, verdict;
1658        bool tunnel, new_cp = false;
1659        union nf_inet_addr *raddr;
1660        char *outer_proto = "IPIP";
1661
            /* Treat the packet as related until the type check below says otherwise */
1662        *related = 1;
1663
1664        /* reassemble IP fragments */
1665        if (ip_is_fragment(ip_hdr(skb))) {
1666                if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
1667                        return NF_STOLEN;
1668        }
1669
            /* The IP header is read only after the optional reassembly above.
             * ihl is kept for the checksum check and for stripping the outer
             * IP header in the tunnel path.
             */
1670        iph = ip_hdr(skb);
1671        offset = ihl = iph->ihl * 4;
1672        ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1673        if (ic == NULL)
1674                return NF_DROP;
1675
1676        IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1677                  ic->type, ntohs(icmp_id(ic)),
1678                  &iph->saddr, &iph->daddr);
1679
1680        /*
1681         * Work through seeing if this is for us.
1682         * These checks are supposed to be in an order that means easy
1683         * things are checked first to speed up processing.... however
1684         * this means that some packets will manage to get a long way
1685         * down this stack and then be rejected, but that's life.
1686         */
1687        if ((ic->type != ICMP_DEST_UNREACH) &&
1688            (ic->type != ICMP_SOURCE_QUENCH) &&
1689            (ic->type != ICMP_TIME_EXCEEDED)) {
1690                *related = 0;
1691                return NF_ACCEPT;
1692        }
1693
1694        /* Now find the contained IP header */
1695        offset += sizeof(_icmph);
1696        cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1697        if (cih == NULL)
1698                return NF_ACCEPT; /* The packet looks wrong, ignore */
            /* Destination of the embedded packet; used as the tunnel lookup key */
1699        raddr = (union nf_inet_addr *)&cih->daddr;
1700
1701        /* Special case for errors for IPIP/UDP/GRE tunnel packets */
1702        tunnel = false;
1703        if (cih->protocol == IPPROTO_IPIP) {
1704                struct ip_vs_dest *dest;
1705
                    /* Only a first fragment carries the inner IP header */
1706                if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1707                        return NF_ACCEPT;
1708                /* Error for our IPIP must arrive at LOCAL_IN */
1709                if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
1710                        return NF_ACCEPT;
1711                dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0);
1712                /* Only for known tunnel */
1713                if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
1714                        return NF_ACCEPT;
                    /* Step over the tunnel IP header to the IP header behind it */
1715                offset += cih->ihl * 4;
1716                cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1717                if (cih == NULL)
1718                        return NF_ACCEPT; /* The packet looks wrong, ignore */
1719                tunnel = true;
1720        } else if ((cih->protocol == IPPROTO_UDP ||     /* Can be UDP encap */
1721                    cih->protocol == IPPROTO_GRE) &&    /* Can be GRE encap */
1722                   /* Error for our tunnel must arrive at LOCAL_IN */
1723                   (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) {
1724                __u8 iproto;
1725                int ulen;
1726
1727                /* Non-first fragment has no UDP/GRE header */
1728                if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1729                        return NF_ACCEPT;
                    /* offset2: start of the UDP/GRE header inside the ICMP payload */
1730                offset2 = offset + cih->ihl * 4;
1731                if (cih->protocol == IPPROTO_UDP) {
1732                        ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET,
1733                                              raddr, &iproto);
1734                        outer_proto = "UDP";
1735                } else {
1736                        ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET,
1737                                              raddr, &iproto);
1738                        outer_proto = "GRE";
1739                }
                    /* ulen == 0: no known tunnel header was recognised, so the
                     * embedded packet is handled below as plain UDP/GRE and
                     * cih/offset stay as they are.
                     */
1740                if (ulen > 0) {
1741                        /* Skip IP and UDP/GRE tunnel headers */
1742                        offset = offset2 + ulen;
1743                        /* Now we should be at the original IP header */
1744                        cih = skb_header_pointer(skb, offset, sizeof(_ciph),
1745                                                 &_ciph);
1746                        if (cih && cih->version == 4 && cih->ihl >= 5 &&
1747                            iproto == IPPROTO_IPIP)
1748                                tunnel = true;
1749                        else
1750                                return NF_ACCEPT;
1751                }
1752        }
1753
            /* Look up per-protocol data for the embedded packet's protocol;
             * without it the packet is not handled here.
             */
1754        pd = ip_vs_proto_data_get(ipvs, cih->protocol);
1755        if (!pd)
1756                return NF_ACCEPT;
1757        pp = pd->pp;
1758
1759        /* Is the embedded protocol header present? */
1760        if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1761                     pp->dont_defrag))
1762                return NF_ACCEPT;
1763
1764        IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1765                      "Checking incoming ICMP for");
1766
            /* offset2 remembers where the embedded IP header starts (used by
             * the tunnel path); offset is replaced by ciph.len as filled in by
             * ip_vs_fill_iph_skb_icmp().
             * NOTE(review): the !tunnel argument presumably selects inverse
             * address matching for non-tunnel errors - confirm against
             * ip_vs_fill_iph_skb_icmp().
             */
1767        offset2 = offset;
1768        ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph);
1769        offset = ciph.len;
1770
1771        /* The embedded headers contain source and dest in reverse order.
1772         * For IPIP/UDP/GRE tunnel this is error for request, not for reply.
1773         */
1774        cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
1775                             ipvs, AF_INET, skb, &ciph);
1776
1777        if (!cp) {
1778                int v;
1779
                    /* No connection: tunnel errors are never scheduled, other
                     * errors only when sysctl_schedule_icmp() is set.
                     */
1780                if (tunnel || !sysctl_schedule_icmp(ipvs))
1781                        return NF_ACCEPT;
1782
1783                if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
1784                        return v;
                    /* Remember that cp was created here; see the release at out: */
1785                new_cp = true;
1786        }
1787
            /* From here on a connection reference is held; leave via out: */
1788        verdict = NF_DROP;
1789
1790        /* Ensure the checksum is correct */
1791        if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1792                /* Failed checksum! */
1793                IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1794                          &iph->saddr);
1795                goto out;
1796        }
1797
1798        if (tunnel) {
                    /* Copy what is needed from the ICMP header before the skb
                     * is pulled below.
                     */
1799                __be32 info = ic->un.gateway;
1800                __u8 type = ic->type;
1801                __u8 code = ic->code;
1802
1803                /* Update the MTU */
1804                if (ic->type == ICMP_DEST_UNREACH &&
1805                    ic->code == ICMP_FRAG_NEEDED) {
1806                        struct ip_vs_dest *dest = cp->dest;
1807                        u32 mtu = ntohs(ic->un.frag.mtu);
                            /* Cached up front; presumably because cih may point
                             * into skb data that pskb_pull() can move - confirm.
                             */
1808                        __be16 frag_off = cih->frag_off;
1809
1810                        /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */
1811                        if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
1812                                goto ignore_tunnel;
                            /* Keep offset2 relative to the new start of data */
1813                        offset2 -= ihl + sizeof(_icmph);
1814                        skb_reset_network_header(skb);
1815                        IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n",
1816                                  outer_proto, &ip_hdr(skb)->saddr,
1817                                  &ip_hdr(skb)->daddr, mtu);
1818                        ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
1819                        /* Client uses PMTUD? */
1820                        if (!(frag_off & htons(IP_DF)))
1821                                goto ignore_tunnel;
1822                        /* Prefer the resulting PMTU */
1823                        if (dest) {
1824                                struct ip_vs_dest_dst *dest_dst;
1825
1826                                dest_dst = rcu_dereference(dest->dest_dst);
1827                                if (dest_dst)
1828                                        mtu = dst_mtu(dest_dst->dst_cache);
1829                        }
                            /* Subtract one IP header, but only while the result
                             * stays above 68 (presumably the minimum IPv4 MTU).
                             */
1830                        if (mtu > 68 + sizeof(struct iphdr))
1831                                mtu -= sizeof(struct iphdr);
1832                        info = htonl(mtu);
1833                }
1834                /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of
1835                 * original request.
1836                 */
1837                if (pskb_pull(skb, offset2) == NULL)
1838                        goto ignore_tunnel;
1839                skb_reset_network_header(skb);
1840                IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
1841                        &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1842                        type, code, ntohl(info));
                    /* The skb now starts at the original request, so the ICMP
                     * error is generated against that packet.
                     */
1843                icmp_send(skb, type, code, info);
1844                /* ICMP can be shorter but anyways, account it */
1845                ip_vs_out_stats(cp, skb);
1846
1847ignore_tunnel:
                    /* The tunnel error ends here: free the skb and report it
                     * as stolen.
                     */
1848                consume_skb(skb);
1849                verdict = NF_STOLEN;
1850                goto out;
1851        }
1852
1853        /* do the statistics and put it back */
1854        ip_vs_in_stats(cp, skb);
            /* For TCP/UDP/SCTP extend offset over two 16-bit fields (the ports) */
1855        if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
1856            IPPROTO_SCTP == cih->protocol)
1857                offset += 2 * sizeof(__u16);
1858        verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
1859
1860out:
            /* A connection scheduled in this call is released with
             * ip_vs_conn_put(), a looked-up one with __ip_vs_conn_put().
             */
1861        if (likely(!new_cp))
1862                __ip_vs_conn_put(cp);
1863        else
1864                ip_vs_conn_put(cp);
1865
1866        return verdict;
1867}
1868
1869#ifdef CONFIG_IP_VS_IPV6
1870static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
1871                            int *related, unsigned int hooknum,
1872                            struct ip_vs_iphdr *iph)
1873{
1874        struct icmp6hdr _icmph, *ic;
1875        struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
1876        struct ip_vs_conn *cp;
1877        struct ip_vs_protocol *pp;
1878        struct ip_vs_proto_data *pd;
1879        unsigned int offset, verdict;
1880        bool new_cp = false;
1881
1882        *related = 1;
1883
1884        ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph);
1885        if (ic == NULL)
1886                return NF_DROP;
1887
1888        /*
1889         * Work through seeing if this is for us.
1890         * These checks are supposed to be in an order that means easy
1891         * things are checked first to speed up processing.... however
1892         * this means that some packets will manage to get a long way
1893         * down this stack and then be rejected, but that's life.
1894         */
1895        if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
1896                *related = 0;
1897                return NF_ACCEPT;
1898        }
1899        /* Fragment header that is before ICMP header tells us that:
1900         * it's not an error message since they can't be fragmented.
1901         */
1902        if (iph->flags & IP6_FH_F_FRAG)
1903                return NF_DROP;
1904
1905        IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
1906                  ic->icmp6_type, ntohs(icmpv6_id(ic)),
1907                  &iph->saddr, &iph->daddr);
1908
1909        offset = iph->len + sizeof(_icmph);
1910        if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph))
1911                return NF_ACCEPT;
1912
1913        pd = ip_vs_proto_data_get(ipvs, ciph.protocol);
1914        if (!pd)
1915                return NF_ACCEPT;
1916        pp = pd->pp;
1917
1918        /* Cannot handle fragmented embedded protocol */
1919        if (ciph.fragoffs)
1920                return NF_ACCEPT;
1921
1922        IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
1923                      "Checking incoming ICMPv6 for");
1924
1925        /* The embedded headers contain source and dest in reverse order
1926         * if not from localhost
1927         */
1928        cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
1929                             ipvs, AF_INET6, skb, &ciph);
1930
1931        if (!cp) {
1932                int v;
1933
1934                if (!sysctl_schedule_icmp(ipvs))
1935                        return NF_ACCEPT;
1936
1937                if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph))
1938                        return v;
1939
1940                new_cp = true;
1941        }
1942
1943        /* VS/TUN, VS/DR and LOCALNODE just let it go */
1944        if ((hooknum == NF_INET_LOCAL_OUT) &&
1945            (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
1946                verdict = NF_ACCEPT;
1947                goto out;
1948        }
1949
1950        /* do the statistics and put it back */
1951        ip_vs_in_stats(cp, skb);
1952
1953        /* Need to mangle contained IPv6 header in ICMPv6 packet */
1954        offset = ciph.len;
1955        if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
1956            IPPROTO_SCTP == ciph.protocol)
1957                offset += 2 * sizeof(__u16); /* Also mangle ports */
1958
1959        verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph);
1960
1961out:
1962        if (likely(!new_cp))
1963                __ip_vs_conn_put(cp);
1964        else
1965                ip_vs_conn_put(cp);
1966
1967        return verdict;
1968}
1969#endif
1970
1971
1972/*
1973 *      Check if it's for virtual services, look it up,
1974 *      and send it on its way...
     *
     *      @ipvs:    IPVS state of the network namespace
     *      @hooknum: netfilter hook the packet was seen in
     *      @skb:     the packet
     *      @af:      address family (AF_INET, or AF_INET6 when
     *                CONFIG_IP_VS_IPV6 is set)
     *
     *      Returns a netfilter verdict. NF_ACCEPT is returned for packets
     *      that are not handled here; NF_DROP when the connection's
     *      destination is unavailable or an old conntrack is still in use;
     *      otherwise the verdict of the ICMP handlers,
     *      ip_vs_try_to_schedule() or the connection's packet_xmit handler.
1975 */
1976static unsigned int
1977ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
1978{
1979        struct ip_vs_iphdr iph;
1980        struct ip_vs_protocol *pp;
1981        struct ip_vs_proto_data *pd;
1982        struct ip_vs_conn *cp;
1983        int ret, pkts;
1984        int conn_reuse_mode;
1985        struct sock *sk;
1986
1987        /* Already marked as IPVS request or reply? */
1988        if (skb->ipvs_property)
1989                return NF_ACCEPT;
1990
1991        /*
1992         *      Early filter:
1993         *      - remote client: only PACKET_HOST
1994         *      - route: used for struct net when skb->dev is unset
             *      The header is parsed here only for the debug message.
1995         */
1996        if (unlikely((skb->pkt_type != PACKET_HOST &&
1997                      hooknum != NF_INET_LOCAL_OUT) ||
1998                     !skb_dst(skb))) {
1999                ip_vs_fill_iph_skb(af, skb, false, &iph);
2000                IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
2001                              " ignored in hook %u\n",
2002                              skb->pkt_type, iph.protocol,
2003                              IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
2004                return NF_ACCEPT;
2005        }
2006        /* ipvs enabled in this netns ? */
2007        if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
2008                return NF_ACCEPT;
2009
2010        ip_vs_fill_iph_skb(af, skb, false, &iph);
2011
2012        /* Bad... Do not break raw sockets */
2013        sk = skb_to_full_sk(skb);
2014        if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
2015                     af == AF_INET)) {
2016
                    /* Locally generated IPv4 from a PF_INET socket with
                     * nodefrag set is left alone.
                     */
2017                if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
2018                        return NF_ACCEPT;
2019        }
2020
            /* ICMP/ICMPv6: when the handler reports the packet as related its
             * verdict is final, otherwise processing continues below.
             */
2021#ifdef CONFIG_IP_VS_IPV6
2022        if (af == AF_INET6) {
2023                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
2024                        int related;
2025                        int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related,
2026                                                       hooknum, &iph);
2027
2028                        if (related)
2029                                return verdict;
2030                }
2031        } else
2032#endif
2033                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
2034                        int related;
2035                        int verdict = ip_vs_in_icmp(ipvs, skb, &related,
2036                                                    hooknum);
2037
2038                        if (related)
2039                                return verdict;
2040                }
2041
2042        /* Protocol supported? */
2043        pd = ip_vs_proto_data_get(ipvs, iph.protocol);
2044        if (unlikely(!pd)) {
2045                /* The only way we'll see this packet again is if it's
2046                 * encapsulated, so mark it with ipvs_property=1 so we
2047                 * skip it if we're ignoring tunneled packets
2048                 */
2049                if (sysctl_ignore_tunneled(ipvs))
2050                        skb->ipvs_property = 1;
2051
2052                return NF_ACCEPT;
2053        }
2054        pp = pd->pp;
2055        /*
2056         * Check if the packet belongs to an existing connection entry
2057         */
2058        cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
2059                             ipvs, af, skb, &iph);
2060
            /* Connection reuse: a packet that starts a new connection matched
             * an existing entry. Decide whether the old entry has to be
             * expired so that the packet gets scheduled afresh.
             */
2061        conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
2062        if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
2063                bool old_ct = false, resched = false;
2064
                    /* Destination weight is 0 and expire_nodest_conn is set:
                     * always reschedule.
                     */
2065                if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
2066                    unlikely(!atomic_read(&cp->dest->weight))) {
2067                        resched = true;
2068                        old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
2069                } else if (is_new_conn_expected(cp, conn_reuse_mode)) {
2070                        old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
2071                        if (!atomic_read(&cp->n_control)) {
2072                                resched = true;
2073                        } else {
2074                                /* Do not reschedule controlling connection
2075                                 * that uses conntrack while it is still
2076                                 * referenced by controlled connection(s).
2077                                 */
2078                                resched = !old_ct;
2079                        }
2080                }
2081
2082                if (resched) {
2083                        if (!old_ct)
2084                                cp->flags &= ~IP_VS_CONN_F_NFCT;
                            /* Expire only when no controlled connections
                             * still reference this one.
                             */
2085                        if (!atomic_read(&cp->n_control))
2086                                ip_vs_conn_expire_now(cp);
2087                        __ip_vs_conn_put(cp);
                            /* Old conntrack still in use: drop this packet */
2088                        if (old_ct)
2089                                return NF_DROP;
                            /* Fall through to scheduling as for a new packet */
2090                        cp = NULL;
2091                }
2092        }
2093
2094        /* Check the server status */
2095        if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
2096                /* the destination server is not available */
2097                if (sysctl_expire_nodest_conn(ipvs)) {
2098                        bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
2099
2100                        if (!old_ct)
2101                                cp->flags &= ~IP_VS_CONN_F_NFCT;
2102
2103                        ip_vs_conn_expire_now(cp);
2104                        __ip_vs_conn_put(cp);
2105                        if (old_ct)
2106                                return NF_DROP;
                            /* Connection expired: schedule the packet anew */
2107                        cp = NULL;
2108                } else {
2109                        __ip_vs_conn_put(cp);
2110                        return NF_DROP;
2111                }
2112        }
2113
            /* No usable connection: try to schedule one. On failure the
             * verdict supplied by ip_vs_try_to_schedule() is returned.
             */
2114        if (unlikely(!cp)) {
2115                int v;
2116
2117                if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
2118                        return v;
2119        }
2120
2121        IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
2122
2123        ip_vs_in_stats(cp, skb);
2124        ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
2125        if (cp->packet_xmit)
2126                ret = cp->packet_xmit(skb, cp, pp, &iph);
2127                /* do not touch skb anymore */
2128        else {
2129                IP_VS_DBG_RL("warning: packet_xmit is null");
2130                ret = NF_ACCEPT;
2131        }
2132
2133        /* Increase its packet counter and check if it is needed
2134         * to be synchronized
2135         *
2136         * Sync connection if it is about to close to
2137         * encourage the standby servers to update the connections timeout
2138         *
2139         * For ONE_PKT let ip_vs_sync_conn() do the filter work.
2140         */
2141
2142        if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
2143                pkts = sysctl_sync_threshold(ipvs);
2144        else
2145                pkts = atomic_add_return(1, &cp->in_pkts);
2146
2147        if (ipvs->sync_state & IP_VS_STATE_MASTER)
2148                ip_vs_sync_conn(ipvs, cp, pkts);
2149        else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
2150                /* increment is done inside ip_vs_sync_conn too */
2151                atomic_inc(&cp->control->in_pkts);
2152
            /* Release the connection reference held during processing */
2153        ip_vs_conn_put(cp);
2154        return ret;
2155}
2156
2157/*
2158 *      AF_INET handler in NF_INET_LOCAL_IN chain
2159 *      Schedule and forward packets from remote clients
2160 */
2161static unsigned int
2162ip_vs_remote_request4(void *priv, struct sk_buff *skb,
2163                      const struct nf_hook_state *state)
2164{
2165        return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
2166}
2167
2168/*
2169 *      AF_INET handler in NF_INET_LOCAL_OUT chain
2170 *      Schedule and forward packets from local clients
2171 */
2172static unsigned int
2173ip_vs_local_request4(void *priv, struct sk_buff *skb,
2174                     const struct nf_hook_state *state)
2175{
2176        return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
2177}
2178
2179#ifdef CONFIG_IP_VS_IPV6
2180
2181/*
2182 *      AF_INET6 handler in NF_INET_LOCAL_IN chain
2183 *      Schedule and forward packets from remote clients
2184 */
2185static unsigned int
2186ip_vs_remote_request6(void *priv, struct sk_buff *skb,
2187                      const struct nf_hook_state *state)
2188{
2189        return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
2190}
2191
2192/*
2193 *      AF_INET6 handler in NF_INET_LOCAL_OUT chain
2194 *      Schedule and forward packets from local clients
2195 */
2196static unsigned int
2197ip_vs_local_request6(void *priv, struct sk_buff *skb,
2198                     const struct nf_hook_state *state)
2199{
2200        return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
2201}
2202
2203#endif
2204
2205
2206/*
2207 *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
2208 *      related packets destined for 0.0.0.0/0.
2209 *      When fwmark-based virtual service is used, such as transparent
2210 *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
2211 *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
2212 *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
2213 *      and send them to ip_vs_in_icmp.
2214 */
2215static unsigned int
2216ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
2217                   const struct nf_hook_state *state)
2218{
2219        int r;
2220        struct netns_ipvs *ipvs = net_ipvs(state->net);
2221
2222        if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
2223                return NF_ACCEPT;
2224
2225        /* ipvs enabled in this netns ? */
2226        if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
2227                return NF_ACCEPT;
2228
2229        return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
2230}
2231
2232#ifdef CONFIG_IP_VS_IPV6
2233static unsigned int
2234ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb,
2235                      const struct nf_hook_state *state)
2236{
2237        int r;
2238        struct netns_ipvs *ipvs = net_ipvs(state->net);
2239        struct ip_vs_iphdr iphdr;
2240
2241        ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
2242        if (iphdr.protocol != IPPROTO_ICMPV6)
2243                return NF_ACCEPT;
2244
2245        /* ipvs enabled in this netns ? */
2246        if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
2247                return NF_ACCEPT;
2248
2249        return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
2250}
2251#endif
2252
2253
2254static const struct nf_hook_ops ip_vs_ops4[] = {
2255        /* After packet filtering, change source only for VS/NAT */
2256        {
2257                .hook           = ip_vs_reply4,
2258                .pf             = NFPROTO_IPV4,
2259                .hooknum        = NF_INET_LOCAL_IN,
2260                .priority       = NF_IP_PRI_NAT_SRC - 2,
2261        },
2262        /* After packet filtering, forward packet through VS/DR, VS/TUN,
2263         * or VS/NAT(change destination), so that filtering rules can be
2264         * applied to IPVS. */
2265        {
2266                .hook           = ip_vs_remote_request4,
2267                .pf             = NFPROTO_IPV4,
2268                .hooknum        = NF_INET_LOCAL_IN,
2269                .priority       = NF_IP_PRI_NAT_SRC - 1,
2270        },
2271        /* Before ip_vs_in, change source only for VS/NAT */
2272        {
2273                .hook           = ip_vs_local_reply4,
2274                .pf             = NFPROTO_IPV4,
2275                .hooknum        = NF_INET_LOCAL_OUT,
2276                .priority       = NF_IP_PRI_NAT_DST + 1,
2277        },
2278        /* After mangle, schedule and forward local requests */
2279        {
2280                .hook           = ip_vs_local_request4,
2281                .pf             = NFPROTO_IPV4,
2282                .hooknum        = NF_INET_LOCAL_OUT,
2283                .priority       = NF_IP_PRI_NAT_DST + 2,
2284        },
2285        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
2286         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
2287        {
2288                .hook           = ip_vs_forward_icmp,
2289                .pf             = NFPROTO_IPV4,
2290                .hooknum        = NF_INET_FORWARD,
2291                .priority       = 99,
2292        },
2293        /* After packet filtering, change source only for VS/NAT */
2294        {
2295                .hook           = ip_vs_reply4,
2296                .pf             = NFPROTO_IPV4,
2297                .hooknum        = NF_INET_FORWARD,
2298                .priority       = 100,
2299        },
2300};
2301
2302#ifdef CONFIG_IP_VS_IPV6
2303static const struct nf_hook_ops ip_vs_ops6[] = {
2304        /* After packet filtering, change source only for VS/NAT */
2305        {
2306                .hook           = ip_vs_reply6,
2307                .pf             = NFPROTO_IPV6,
2308                .hooknum        = NF_INET_LOCAL_IN,
2309                .priority       = NF_IP6_PRI_NAT_SRC - 2,
2310        },
2311        /* After packet filtering, forward packet through VS/DR, VS/TUN,
2312         * or VS/NAT(change destination), so that filtering rules can be
2313         * applied to IPVS. */
2314        {
2315                .hook           = ip_vs_remote_request6,
2316                .pf             = NFPROTO_IPV6,
2317                .hooknum        = NF_INET_LOCAL_IN,
2318                .priority       = NF_IP6_PRI_NAT_SRC - 1,
2319        },
2320        /* Before ip_vs_in, change source only for VS/NAT */
2321        {
2322                .hook           = ip_vs_local_reply6,
2323                .pf             = NFPROTO_IPV6,
2324                .hooknum        = NF_INET_LOCAL_OUT,
2325                .priority       = NF_IP6_PRI_NAT_DST + 1,
2326        },
2327        /* After mangle, schedule and forward local requests */
2328        {
2329                .hook           = ip_vs_local_request6,
2330                .pf             = NFPROTO_IPV6,
2331                .hooknum        = NF_INET_LOCAL_OUT,
2332                .priority       = NF_IP6_PRI_NAT_DST + 2,
2333        },
2334        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
2335         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
2336        {
2337                .hook           = ip_vs_forward_icmp_v6,
2338                .pf             = NFPROTO_IPV6,
2339                .hooknum        = NF_INET_FORWARD,
2340                .priority       = 99,
2341        },
2342        /* After packet filtering, change source only for VS/NAT */
2343        {
2344                .hook           = ip_vs_reply6,
2345                .pf             = NFPROTO_IPV6,
2346                .hooknum        = NF_INET_FORWARD,
2347                .priority       = 100,
2348        },
2349};
2350#endif
2351
2352int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af)
2353{
2354        const struct nf_hook_ops *ops;
2355        unsigned int count;
2356        unsigned int afmask;
2357        int ret = 0;
2358
2359        if (af == AF_INET6) {
2360#ifdef CONFIG_IP_VS_IPV6
2361                ops = ip_vs_ops6;
2362                count = ARRAY_SIZE(ip_vs_ops6);
2363                afmask = 2;
2364#else
2365                return -EINVAL;
2366#endif
2367        } else {
2368                ops = ip_vs_ops4;
2369                count = ARRAY_SIZE(ip_vs_ops4);
2370                afmask = 1;
2371        }
2372
2373        if (!(ipvs->hooks_afmask & afmask)) {
2374                ret = nf_register_net_hooks(ipvs->net, ops, count);
2375                if (ret >= 0)
2376                        ipvs->hooks_afmask |= afmask;
2377        }
2378        return ret;
2379}
2380
2381void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af)
2382{
2383        const struct nf_hook_ops *ops;
2384        unsigned int count;
2385        unsigned int afmask;
2386
2387        if (af == AF_INET6) {
2388#ifdef CONFIG_IP_VS_IPV6
2389                ops = ip_vs_ops6;
2390                count = ARRAY_SIZE(ip_vs_ops6);
2391                afmask = 2;
2392#else
2393                return;
2394#endif
2395        } else {
2396                ops = ip_vs_ops4;
2397                count = ARRAY_SIZE(ip_vs_ops4);
2398                afmask = 1;
2399        }
2400
2401        if (ipvs->hooks_afmask & afmask) {
2402                nf_unregister_net_hooks(ipvs->net, ops, count);
2403                ipvs->hooks_afmask &= ~afmask;
2404        }
2405}
2406
2407/*
2408 *      Initialize IP Virtual Server netns mem.
2409 */
2410static int __net_init __ip_vs_init(struct net *net)
2411{
2412        struct netns_ipvs *ipvs;
2413
2414        ipvs = net_generic(net, ip_vs_net_id);
2415        if (ipvs == NULL)
2416                return -ENOMEM;
2417
2418        /* Hold the beast until a service is registerd */
2419        ipvs->enable = 0;
2420        ipvs->net = net;
2421        /* Counters used for creating unique names */
2422        ipvs->gen = atomic_read(&ipvs_netns_cnt);
2423        atomic_inc(&ipvs_netns_cnt);
2424        net->ipvs = ipvs;
2425
2426        if (ip_vs_estimator_net_init(ipvs) < 0)
2427                goto estimator_fail;
2428
2429        if (ip_vs_control_net_init(ipvs) < 0)
2430                goto control_fail;
2431
2432        if (ip_vs_protocol_net_init(ipvs) < 0)
2433                goto protocol_fail;
2434
2435        if (ip_vs_app_net_init(ipvs) < 0)
2436                goto app_fail;
2437
2438        if (ip_vs_conn_net_init(ipvs) < 0)
2439                goto conn_fail;
2440
2441        if (ip_vs_sync_net_init(ipvs) < 0)
2442                goto sync_fail;
2443
2444        return 0;
2445/*
2446 * Error handling
2447 */
2448
2449sync_fail:
2450        ip_vs_conn_net_cleanup(ipvs);
2451conn_fail:
2452        ip_vs_app_net_cleanup(ipvs);
2453app_fail:
2454        ip_vs_protocol_net_cleanup(ipvs);
2455protocol_fail:
2456        ip_vs_control_net_cleanup(ipvs);
2457control_fail:
2458        ip_vs_estimator_net_cleanup(ipvs);
2459estimator_fail:
2460        net->ipvs = NULL;
2461        return -ENOMEM;
2462}
2463
2464static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list)
2465{
2466        struct netns_ipvs *ipvs;
2467        struct net *net;
2468
2469        ip_vs_service_nets_cleanup(net_list);   /* ip_vs_flush() with locks */
2470        list_for_each_entry(net, net_list, exit_list) {
2471                ipvs = net_ipvs(net);
2472                ip_vs_conn_net_cleanup(ipvs);
2473                ip_vs_app_net_cleanup(ipvs);
2474                ip_vs_protocol_net_cleanup(ipvs);
2475                ip_vs_control_net_cleanup(ipvs);
2476                ip_vs_estimator_net_cleanup(ipvs);
2477                IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
2478                net->ipvs = NULL;
2479        }
2480}
2481
2482static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
2483{
2484        struct netns_ipvs *ipvs;
2485        struct net *net;
2486
2487        EnterFunction(2);
2488        list_for_each_entry(net, net_list, exit_list) {
2489                ipvs = net_ipvs(net);
2490                ip_vs_unregister_hooks(ipvs, AF_INET);
2491                ip_vs_unregister_hooks(ipvs, AF_INET6);
2492                ipvs->enable = 0;       /* Disable packet reception */
2493                smp_wmb();
2494                ip_vs_sync_net_cleanup(ipvs);
2495        }
2496        LeaveFunction(2);
2497}
2498
2499static struct pernet_operations ipvs_core_ops = {
2500        .init = __ip_vs_init,
2501        .exit_batch = __ip_vs_cleanup_batch,
2502        .id   = &ip_vs_net_id,
2503        .size = sizeof(struct netns_ipvs),
2504};
2505
2506static struct pernet_operations ipvs_core_dev_ops = {
2507        .exit_batch = __ip_vs_dev_cleanup_batch,
2508};
2509
2510/*
2511 *      Initialize IP Virtual Server
2512 */
2513static int __init ip_vs_init(void)
2514{
2515        int ret;
2516
2517        ret = ip_vs_control_init();
2518        if (ret < 0) {
2519                pr_err("can't setup control.\n");
2520                goto exit;
2521        }
2522
2523        ip_vs_protocol_init();
2524
2525        ret = ip_vs_conn_init();
2526        if (ret < 0) {
2527                pr_err("can't setup connection table.\n");
2528                goto cleanup_protocol;
2529        }
2530
2531        ret = register_pernet_subsys(&ipvs_core_ops);   /* Alloc ip_vs struct */
2532        if (ret < 0)
2533                goto cleanup_conn;
2534
2535        ret = register_pernet_device(&ipvs_core_dev_ops);
2536        if (ret < 0)
2537                goto cleanup_sub;
2538
2539        ret = ip_vs_register_nl_ioctl();
2540        if (ret < 0) {
2541                pr_err("can't register netlink/ioctl.\n");
2542                goto cleanup_dev;
2543        }
2544
2545        pr_info("ipvs loaded.\n");
2546
2547        return ret;
2548
2549cleanup_dev:
2550        unregister_pernet_device(&ipvs_core_dev_ops);
2551cleanup_sub:
2552        unregister_pernet_subsys(&ipvs_core_ops);
2553cleanup_conn:
2554        ip_vs_conn_cleanup();
2555cleanup_protocol:
2556        ip_vs_protocol_cleanup();
2557        ip_vs_control_cleanup();
2558exit:
2559        return ret;
2560}
2561
2562static void __exit ip_vs_cleanup(void)
2563{
2564        ip_vs_unregister_nl_ioctl();
2565        unregister_pernet_device(&ipvs_core_dev_ops);
2566        unregister_pernet_subsys(&ipvs_core_ops);       /* free ip_vs struct */
2567        ip_vs_conn_cleanup();
2568        ip_vs_protocol_cleanup();
2569        ip_vs_control_cleanup();
2570        pr_info("ipvs unloaded.\n");
2571}
2572
2573module_init(ip_vs_init);
2574module_exit(ip_vs_cleanup);
2575MODULE_LICENSE("GPL");
2576