linux/net/netfilter/ipvs/ip_vs_core.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * IPVS         An implementation of the IP virtual server support for the
   4 *              LINUX operating system.  IPVS is now implemented as a module
   5 *              over the Netfilter framework. IPVS can be used to build a
   6 *              high-performance and highly available server based on a
   7 *              cluster of servers.
   8 *
   9 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
  10 *              Peter Kese <peter.kese@ijs.si>
  11 *              Julian Anastasov <ja@ssi.bg>
  12 *
  13 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
  14 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
  15 * and others.
  16 *
  17 * Changes:
  18 *      Paul `Rusty' Russell            properly handle non-linear skbs
  19 *      Harald Welte                    don't use nfcache
  20 */
  21
  22#define KMSG_COMPONENT "IPVS"
  23#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  24
  25#include <linux/module.h>
  26#include <linux/kernel.h>
  27#include <linux/ip.h>
  28#include <linux/tcp.h>
  29#include <linux/sctp.h>
  30#include <linux/icmp.h>
  31#include <linux/slab.h>
  32
  33#include <net/ip.h>
  34#include <net/tcp.h>
  35#include <net/udp.h>
  36#include <net/icmp.h>                   /* for icmp_send */
  37#include <net/gue.h>
  38#include <net/gre.h>
  39#include <net/route.h>
  40#include <net/ip6_checksum.h>
  41#include <net/netns/generic.h>          /* net_generic() */
  42
  43#include <linux/netfilter.h>
  44#include <linux/netfilter_ipv4.h>
  45
  46#ifdef CONFIG_IP_VS_IPV6
  47#include <net/ipv6.h>
  48#include <linux/netfilter_ipv6.h>
  49#include <net/ip6_route.h>
  50#endif
  51
  52#include <net/ip_vs.h>
  53#include <linux/indirect_call_wrapper.h>
  54
  55
    /*
     * Symbols exported from this file.  ip_vs_tcp_conn_listen and
     * ip_vs_get_debug_level are exported only when the matching CONFIG
     * option is set.
     */
  56EXPORT_SYMBOL(register_ip_vs_scheduler);
  57EXPORT_SYMBOL(unregister_ip_vs_scheduler);
  58EXPORT_SYMBOL(ip_vs_proto_name);
  59EXPORT_SYMBOL(ip_vs_conn_new);
  60EXPORT_SYMBOL(ip_vs_conn_in_get);
  61EXPORT_SYMBOL(ip_vs_conn_out_get);
  62#ifdef CONFIG_IP_VS_PROTO_TCP
  63EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
  64#endif
  65EXPORT_SYMBOL(ip_vs_conn_put);
  66#ifdef CONFIG_IP_VS_DEBUG
  67EXPORT_SYMBOL(ip_vs_get_debug_level);
  68#endif
  69EXPORT_SYMBOL(ip_vs_new_conn_out);
  70
    /*
     * Declarations of the per-protocol SNAT handlers that SNAT_CALL() below
     * names as candidate targets.  Each one is declared only when its
     * protocol is configured.
     */
  71#ifdef CONFIG_IP_VS_PROTO_TCP
  72INDIRECT_CALLABLE_DECLARE(int
  73        tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
  74                         struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
  75#endif
  76
  77#ifdef CONFIG_IP_VS_PROTO_UDP
  78INDIRECT_CALLABLE_DECLARE(int
  79        udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
  80                         struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
  81#endif
  82
    /*
     * SNAT_CALL(f, args...): call the SNAT handler pointer f with args.
     * The handlers passed to INDIRECT_CALL_n as likely targets depend on
     * which of TCP/UDP are configured; with neither, f is called directly.
     */
  83#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP)
  84#define SNAT_CALL(f, ...) \
  85        INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__)
  86#elif defined(CONFIG_IP_VS_PROTO_TCP)
  87#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__)
  88#elif defined(CONFIG_IP_VS_PROTO_UDP)
  89#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__)
  90#else
  91#define SNAT_CALL(f, ...) f(__VA_ARGS__)
  92#endif
  93
    /* NOTE(review): presumably the pernet id used with net_generic() - confirm */
  94static unsigned int ip_vs_net_id __read_mostly;
  95/* netns cnt used for uniqueness */
  96static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
  97
  98/* ID used in ICMP lookups */
  99#define icmp_id(icmph)          (((icmph)->un).echo.id)
 100#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
 101
 102const char *ip_vs_proto_name(unsigned int proto)
 103{
 104        static char buf[20];
 105
 106        switch (proto) {
 107        case IPPROTO_IP:
 108                return "IP";
 109        case IPPROTO_UDP:
 110                return "UDP";
 111        case IPPROTO_TCP:
 112                return "TCP";
 113        case IPPROTO_SCTP:
 114                return "SCTP";
 115        case IPPROTO_ICMP:
 116                return "ICMP";
 117#ifdef CONFIG_IP_VS_IPV6
 118        case IPPROTO_ICMPV6:
 119                return "ICMPv6";
 120#endif
 121        default:
 122                sprintf(buf, "IP_%u", proto);
 123                return buf;
 124        }
 125}
 126
 127void ip_vs_init_hash_table(struct list_head *table, int rows)
 128{
 129        while (--rows >= 0)
 130                INIT_LIST_HEAD(&table[rows]);
 131}
 132
 133static inline void
 134ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 135{
 136        struct ip_vs_dest *dest = cp->dest;
 137        struct netns_ipvs *ipvs = cp->ipvs;
 138
 139        if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 140                struct ip_vs_cpu_stats *s;
 141                struct ip_vs_service *svc;
 142
 143                local_bh_disable();
 144
 145                s = this_cpu_ptr(dest->stats.cpustats);
 146                u64_stats_update_begin(&s->syncp);
 147                s->cnt.inpkts++;
 148                s->cnt.inbytes += skb->len;
 149                u64_stats_update_end(&s->syncp);
 150
 151                svc = rcu_dereference(dest->svc);
 152                s = this_cpu_ptr(svc->stats.cpustats);
 153                u64_stats_update_begin(&s->syncp);
 154                s->cnt.inpkts++;
 155                s->cnt.inbytes += skb->len;
 156                u64_stats_update_end(&s->syncp);
 157
 158                s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 159                u64_stats_update_begin(&s->syncp);
 160                s->cnt.inpkts++;
 161                s->cnt.inbytes += skb->len;
 162                u64_stats_update_end(&s->syncp);
 163
 164                local_bh_enable();
 165        }
 166}
 167
 168
 169static inline void
 170ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 171{
 172        struct ip_vs_dest *dest = cp->dest;
 173        struct netns_ipvs *ipvs = cp->ipvs;
 174
 175        if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 176                struct ip_vs_cpu_stats *s;
 177                struct ip_vs_service *svc;
 178
 179                local_bh_disable();
 180
 181                s = this_cpu_ptr(dest->stats.cpustats);
 182                u64_stats_update_begin(&s->syncp);
 183                s->cnt.outpkts++;
 184                s->cnt.outbytes += skb->len;
 185                u64_stats_update_end(&s->syncp);
 186
 187                svc = rcu_dereference(dest->svc);
 188                s = this_cpu_ptr(svc->stats.cpustats);
 189                u64_stats_update_begin(&s->syncp);
 190                s->cnt.outpkts++;
 191                s->cnt.outbytes += skb->len;
 192                u64_stats_update_end(&s->syncp);
 193
 194                s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 195                u64_stats_update_begin(&s->syncp);
 196                s->cnt.outpkts++;
 197                s->cnt.outbytes += skb->len;
 198                u64_stats_update_end(&s->syncp);
 199
 200                local_bh_enable();
 201        }
 202}
 203
 204
 205static inline void
 206ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 207{
 208        struct netns_ipvs *ipvs = svc->ipvs;
 209        struct ip_vs_cpu_stats *s;
 210
 211        local_bh_disable();
 212
 213        s = this_cpu_ptr(cp->dest->stats.cpustats);
 214        u64_stats_update_begin(&s->syncp);
 215        s->cnt.conns++;
 216        u64_stats_update_end(&s->syncp);
 217
 218        s = this_cpu_ptr(svc->stats.cpustats);
 219        u64_stats_update_begin(&s->syncp);
 220        s->cnt.conns++;
 221        u64_stats_update_end(&s->syncp);
 222
 223        s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 224        u64_stats_update_begin(&s->syncp);
 225        s->cnt.conns++;
 226        u64_stats_update_end(&s->syncp);
 227
 228        local_bh_enable();
 229}
 230
 231
 232static inline void
 233ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 234                const struct sk_buff *skb,
 235                struct ip_vs_proto_data *pd)
 236{
 237        if (likely(pd->pp->state_transition))
 238                pd->pp->state_transition(cp, direction, skb, pd);
 239}
 240
 241static inline int
 242ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 243                              struct sk_buff *skb, int protocol,
 244                              const union nf_inet_addr *caddr, __be16 cport,
 245                              const union nf_inet_addr *vaddr, __be16 vport,
 246                              struct ip_vs_conn_param *p)
 247{
 248        ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr,
 249                              vport, p);
 250        p->pe = rcu_dereference(svc->pe);
 251        if (p->pe && p->pe->fill_param)
 252                return p->pe->fill_param(p, skb);
 253
 254        return 0;
 255}
 256
 257/*
 258 *  IPVS persistent scheduling function
 259 *  It creates a connection entry according to its template if exists,
 260 *  or selects a server and creates a connection entry plus a template.
 261 *  Locking: we are svc user (svc->refcnt), so we hold all dests too
 262 *  Protocols supported: TCP, UDP
     *
     *  @svc:      persistent virtual service the packet belongs to
     *  @skb:      the packet; handed to the persistence engine and the
     *             scheduler, and its mark is passed to ip_vs_conn_new()
     *  @src_port: client port (network byte order)
     *  @dst_port: virtual port (network byte order)
     *  @ignored:  written only on failure: -1 when filling the template
     *             parameters or creating a connection fails, 0 when no
     *             destination could be selected; left untouched on success
     *  @iph:      parsed IP header; source and destination addresses are
     *             swapped when ip_vs_iph_inverse(iph) is true
     *
     *  Returns the newly created connection, or NULL on any failure.
 263 */
 264static struct ip_vs_conn *
 265ip_vs_sched_persist(struct ip_vs_service *svc,
 266                    struct sk_buff *skb, __be16 src_port, __be16 dst_port,
 267                    int *ignored, struct ip_vs_iphdr *iph)
 268{
 269        struct ip_vs_conn *cp = NULL;
 270        struct ip_vs_dest *dest;
 271        struct ip_vs_conn *ct;
 272        __be16 dport = 0;               /* destination port to forward */
 273        unsigned int flags;
 274        struct ip_vs_conn_param param;
 275        const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
 276        union nf_inet_addr snet;        /* source network of the client,
 277                                           after masking */
 278        const union nf_inet_addr *src_addr, *dst_addr;
 279
 280        if (likely(!ip_vs_iph_inverse(iph))) {
 281                src_addr = &iph->saddr;
 282                dst_addr = &iph->daddr;
 283        } else {
 284                src_addr = &iph->daddr;
 285                dst_addr = &iph->saddr;
 286        }
 287
 288
 289        /* Mask saddr with the netmask to adjust template granularity */
 290#ifdef CONFIG_IP_VS_IPV6
 291        if (svc->af == AF_INET6)
 292                ipv6_addr_prefix(&snet.in6, &src_addr->in6,
 293                                 (__force __u32) svc->netmask);
 294        else
 295#endif
 296                snet.ip = src_addr->ip & svc->netmask;
 297
 298        IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
 299                      "mnet %s\n",
 300                      IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port),
 301                      IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port),
 302                      IP_VS_DBG_ADDR(svc->af, &snet));
 303
 304        /*
 305         * As far as we know, FTP is a very complicated network protocol, and
 306         * it uses control connection and data connections. For active FTP,
 307         * FTP server initialize data connection to the client, its source port
 308         * is often 20. For passive FTP, FTP server tells the clients the port
 309         * that it passively listens to,  and the client issues the data
 310         * connection. In the tunneling or direct routing mode, the load
 311         * balancer is on the client-to-server half of connection, the port
 312         * number is unknown to the load balancer. So, a conn template like
 313         * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
 314         * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 315         * is created for other persistent services.
 316         */
 317        {
 318                int protocol = iph->protocol;
 319                const union nf_inet_addr *vaddr = dst_addr;
 320                __be16 vport = 0;
 321
 322                if (dst_port == svc->port) {
 323                        /* non-FTP template:
 324                         * <protocol, caddr, 0, vaddr, vport, daddr, dport>
 325                         * FTP template:
 326                         * <protocol, caddr, 0, vaddr, 0, daddr, 0>
 327                         */
 328                        if (svc->port != FTPPORT)
 329                                vport = dst_port;
 330                } else {
 331                        /* Note: persistent fwmark-based services and
 332                         * persistent port zero service are handled here.
 333                         * fwmark template:
 334                         * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
 335                         * port zero template:
 336                         * <protocol,caddr,0,vaddr,0,daddr,0>
 337                         */
 338                        if (svc->fwmark) {
 339                                protocol = IPPROTO_IP;
 340                                vaddr = &fwmark;
 341                        }
 342                }
                    /* the template key uses the masked client net and client port 0 */
 343                /* return *ignored = -1 so NF_DROP can be used */
 344                if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
 345                                                  vaddr, vport, &param) < 0) {
 346                        *ignored = -1;
 347                        return NULL;
 348                }
 349        }
 350
 351        /* Check if a template already exists */
 352        ct = ip_vs_ct_in_get(&param);
 353        if (!ct || !ip_vs_check_template(ct, NULL)) {
 354                struct ip_vs_scheduler *sched;
 355
 356                /*
 357                 * No template found or the dest of the connection
 358                 * template is not available.
 359                 * return *ignored=0 i.e. ICMP and NF_DROP
 360                 */
 361                sched = rcu_dereference(svc->scheduler);
 362                if (sched) {
 363                        /* read svc->sched_data after svc->scheduler */
 364                        smp_rmb();
 365                        dest = sched->schedule(svc, skb, iph);
 366                } else {
 367                        dest = NULL;
 368                }
 369                if (!dest) {
 370                        IP_VS_DBG(1, "p-schedule: no dest found.\n");
                            /* no template will take pe_data over, free it here */
 371                        kfree(param.pe_data);
 372                        *ignored = 0;
 373                        return NULL;
 374                }
 375
                    /* template dport stays 0 for FTP and for non-service ports */
 376                if (dst_port == svc->port && svc->port != FTPPORT)
 377                        dport = dest->port;
 378
 379                /* Create a template
 380                 * This adds param.pe_data to the template,
 381                 * and thus param.pe_data will be destroyed
 382                 * when the template expires */
 383                ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport,
 384                                    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
 385                if (ct == NULL) {
 386                        kfree(param.pe_data);
 387                        *ignored = -1;
 388                        return NULL;
 389                }
 390
 391                ct->timeout = svc->timeout;
 392        } else {
 393                /* set destination with the found template */
 394                dest = ct->dest;
                    /* the lookup copy of pe_data is not attached to anything */
 395                kfree(param.pe_data);
 396        }
 397
            /*
             * Port for the real connection: the packet's own port, replaced
             * by the destination's port when the packet hit the service
             * port and the destination has a non-zero port.
             */
 398        dport = dst_port;
 399        if (dport == svc->port && dest->port)
 400                dport = dest->port;
 401
            /* one-packet mode only for UDP on services flagged ONEPACKET */
 402        flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 403                 && iph->protocol == IPPROTO_UDP) ?
 404                IP_VS_CONN_F_ONE_PACKET : 0;
 405
 406        /*
 407         *    Create a new connection according to the template
 408         */
            /* param is refilled here with the real (unmasked) client address/port */
 409        ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr,
 410                              src_port, dst_addr, dst_port, &param);
 411
 412        cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest,
 413                            skb->mark);
 414        if (cp == NULL) {
 415                ip_vs_conn_put(ct);
 416                *ignored = -1;
 417                return NULL;
 418        }
 419
 420        /*
 421         *    Add its control
 422         */
 423        ip_vs_control_add(cp, ct);
 424        ip_vs_conn_put(ct);
 425
 426        ip_vs_conn_stats(cp, svc);
 427        return cp;
 428}
 429
 430
 431/*
 432 *  IPVS main scheduling function
 433 *  It selects a server according to the virtual service, and
 434 *  creates a connection entry.
 435 *  Protocols supported: TCP, UDP
 436 *
 437 *  Usage of *ignored
 438 *
 439 * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
 440 *       svc/scheduler decides that this packet should be accepted with
 441 *       NF_ACCEPT because it must not be scheduled.
 442 *
 443 * 0 :   scheduler can not find destination, so try bypass or
 444 *       return ICMP and then NF_DROP (ip_vs_leave).
 445 *
 446 * -1 :  scheduler tried to schedule but fatal error occurred, eg.
 447 *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
 448 *       failure such as missing Call-ID, ENOMEM on skb_linearize
 449 *       or pe_data. In this case we should return NF_DROP without
 450 *       any attempts to send ICMP with ip_vs_leave.
     *
     *  @svc:     virtual service the packet matched
     *  @skb:     the packet being scheduled
     *  @pd:      protocol data; pd->pp supplies conn_in_get and is passed
     *            to the packet debug macro
     *  @ignored: out parameter, see above.  It is set to 1 on entry and to
     *            0 once the non-persistent path is taken; on success it is
     *            simply left at the last value stored.
     *  @iph:     parsed IP header; iph->len is the offset the two port
     *            words are read from
     *
     *  Returns the new connection, or NULL when nothing was scheduled.
 451 */
 452struct ip_vs_conn *
 453ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 454               struct ip_vs_proto_data *pd, int *ignored,
 455               struct ip_vs_iphdr *iph)
 456{
 457        struct ip_vs_protocol *pp = pd->pp;
 458        struct ip_vs_conn *cp = NULL;
 459        struct ip_vs_scheduler *sched;
 460        struct ip_vs_dest *dest;
 461        __be16 _ports[2], *pptr, cport, vport;
 462        const void *caddr, *vaddr;
 463        unsigned int flags;
 464
 465        *ignored = 1;
 466        /*
 467         * IPv6 frags, only the first hit here.
 468         */
 469        pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 470        if (pptr == NULL)
 471                return NULL;
 472
            /* an inverse header swaps which side is the client and which the VIP */
 473        if (likely(!ip_vs_iph_inverse(iph))) {
 474                cport = pptr[0];
 475                caddr = &iph->saddr;
 476                vport = pptr[1];
 477                vaddr = &iph->daddr;
 478        } else {
 479                cport = pptr[1];
 480                caddr = &iph->daddr;
 481                vport = pptr[0];
 482                vaddr = &iph->saddr;
 483        }
 484
 485        /*
 486         * FTPDATA needs this check when using local real server.
 487         * Never schedule Active FTPDATA connections from real server.
 488         * For LVS-NAT they must be already created. For other methods
 489         * with persistence the connection is created on SYN+ACK.
 490         */
 491        if (cport == FTPDATA) {
 492                IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
 493                              "Not scheduling FTPDATA");
 494                return NULL;
 495        }
 496
 497        /*
 498         *    Do not schedule replies from local real server.
 499         */
 500        if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) {
                    /* look the packet up in the opposite direction, then restore */
 501                iph->hdr_flags ^= IP_VS_HDR_INVERSE;
 502                cp = INDIRECT_CALL_1(pp->conn_in_get,
 503                                     ip_vs_conn_in_get_proto, svc->ipvs,
 504                                     svc->af, skb, iph);
 505                iph->hdr_flags ^= IP_VS_HDR_INVERSE;
 506
 507                if (cp) {
 508                        IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
 509                                      "Not scheduling reply for existing"
 510                                      " connection");
 511                        __ip_vs_conn_put(cp);
 512                        return NULL;
 513                }
 514        }
 515
 516        /*
 517         *    Persistent service
 518         */
 519        if (svc->flags & IP_VS_SVC_F_PERSISTENT)
 520                return ip_vs_sched_persist(svc, skb, cport, vport, ignored,
 521                                           iph);
 522
 523        *ignored = 0;
 524
 525        /*
 526         *    Non-persistent service
 527         */
 528        if (!svc->fwmark && vport != svc->port) {
 529                if (!svc->port)
 530                        pr_err("Schedule: port zero only supported "
 531                               "in persistent services, "
 532                               "check your ipvs configuration\n");
 533                return NULL;
 534        }
 535
 536        sched = rcu_dereference(svc->scheduler);
 537        if (sched) {
 538                /* read svc->sched_data after svc->scheduler */
 539                smp_rmb();
 540                dest = sched->schedule(svc, skb, iph);
 541        } else {
 542                dest = NULL;
 543        }
 544        if (dest == NULL) {
 545                IP_VS_DBG(1, "Schedule: no dest found.\n");
 546                return NULL;
 547        }
 548
            /* one-packet mode only for UDP on services flagged ONEPACKET */
 549        flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 550                 && iph->protocol == IPPROTO_UDP) ?
 551                IP_VS_CONN_F_ONE_PACKET : 0;
 552
 553        /*
 554         *    Create a connection entry.
 555         */
 556        {
 557                struct ip_vs_conn_param p;
 558
 559                ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
 560                                      caddr, cport, vaddr, vport, &p);
                    /* a destination port of 0 means "keep the virtual port" */
 561                cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
 562                                    dest->port ? dest->port : vport,
 563                                    flags, dest, skb->mark);
 564                if (!cp) {
 565                        *ignored = -1;
 566                        return NULL;
 567                }
 568        }
 569
 570        IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 571                      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
 572                      ip_vs_fwd_tag(cp),
 573                      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
 574                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
 575                      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
 576                      cp->flags, refcount_read(&cp->refcnt));
 577
 578        ip_vs_conn_stats(cp, svc);
 579        return cp;
 580}
 581
 582static inline int ip_vs_addr_is_unicast(struct net *net, int af,
 583                                        union nf_inet_addr *addr)
 584{
 585#ifdef CONFIG_IP_VS_IPV6
 586        if (af == AF_INET6)
 587                return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST;
 588#endif
 589        return (inet_addr_type(net, addr->ip) == RTN_UNICAST);
 590}
 591
 592/*
 593 *  Pass or drop the packet.
 594 *  Called by ip_vs_in, when the virtual service is available but
 595 *  no destination is available for a new connection.
     *
     *  @svc: virtual service the packet matched
     *  @skb: the packet; must not be touched after packet_xmit() ran
     *  @pd:  protocol data, used for the state transition and transmit
     *  @iph: parsed IP header
     *
     *  Returns a netfilter verdict: the result of packet_xmit() for a
     *  cache_bypass connection, NF_ACCEPT for non-FTP ports on an FTP
     *  service, otherwise NF_DROP (after sending ICMP port-unreachable
     *  unless the packet itself is ICMP).
 596 */
 597int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 598                struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
 599{
 600        __be16 _ports[2], *pptr, dport;
 601        struct netns_ipvs *ipvs = svc->ipvs;
 602        struct net *net = ipvs->net;
 603
 604        pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 605        if (!pptr)
 606                return NF_DROP;
            /* destination port is the second port word unless the header is inverse */
 607        dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0];
 608
 609        /* if it is fwmark-based service, the cache_bypass sysctl is up
 610           and the destination is a non-local unicast, then create
 611           a cache_bypass connection entry */
 612        if (sysctl_cache_bypass(ipvs) && svc->fwmark &&
 613            !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) &&
 614            ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) {
 615                int ret;
 616                struct ip_vs_conn *cp;
 617                unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
 618                                      iph->protocol == IPPROTO_UDP) ?
 619                                      IP_VS_CONN_F_ONE_PACKET : 0;
                    /* a bypass entry has no real server: zero address, port 0, no dest */
 620                union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 621
 622                /* create a new connection entry */
 623                IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
 624                {
 625                        struct ip_vs_conn_param p;
 626                        ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
 627                                              &iph->saddr, pptr[0],
 628                                              &iph->daddr, pptr[1], &p);
 629                        cp = ip_vs_conn_new(&p, svc->af, &daddr, 0,
 630                                            IP_VS_CONN_F_BYPASS | flags,
 631                                            NULL, skb->mark);
 632                        if (!cp)
 633                                return NF_DROP;
 634                }
 635
 636                /* statistics */
 637                ip_vs_in_stats(cp, skb);
 638
 639                /* set state */
 640                ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
 641
 642                /* transmit the first SYN packet */
 643                ret = cp->packet_xmit(skb, cp, pd->pp, iph);
 644                /* do not touch skb anymore */
 645
                    /* count the packet on the controlling connection when a
                     * one-packet connection has one, else on cp itself */
 646                if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
 647                        atomic_inc(&cp->control->in_pkts);
 648                else
 649                        atomic_inc(&cp->in_pkts);
 650                ip_vs_conn_put(cp);
 651                return ret;
 652        }
 653
 654        /*
 655         * When the virtual ftp service is presented, packets destined
 656         * for other services on the VIP may get here (except services
 657         * listed in the ipvs table), pass the packets, because it is
 658         * not ipvs job to decide to drop the packets.
 659         */
 660        if (svc->port == FTPPORT && dport != FTPPORT)
 661                return NF_ACCEPT;
 662
            /* drop silently instead of answering an ICMP packet with ICMP */
 663        if (unlikely(ip_vs_iph_icmp(iph)))
 664                return NF_DROP;
 665
 666        /*
 667         * Notify the client that the destination is unreachable, and
 668         * release the socket buffer.
 669         * Since it is in IP layer, the TCP socket is not actually
 670         * created, the TCP RST packet cannot be sent, instead that
 671         * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 672         */
 673#ifdef CONFIG_IP_VS_IPV6
 674        if (svc->af == AF_INET6) {
                    /* give icmpv6_send() a device when the skb has none */
 675                if (!skb->dev)
 676                        skb->dev = net->loopback_dev;
 677                icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
 678        } else
 679#endif
 680                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 681
 682        return NF_DROP;
 683}
 684
 685#ifdef CONFIG_SYSCTL
 686
 687static int sysctl_snat_reroute(struct netns_ipvs *ipvs)
 688{
 689        return ipvs->sysctl_snat_reroute;
 690}
 691
 692static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
 693{
 694        return ipvs->sysctl_nat_icmp_send;
 695}
 696
 697static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
 698{
 699        return ipvs->sysctl_expire_nodest_conn;
 700}
 701
 702#else
 703
 704static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
 705static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
 706static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
 707
 708#endif
 709
 710__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 711{
 712        return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 713}
 714
 715static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
 716{
 717        if (NF_INET_LOCAL_IN == hooknum)
 718                return IP_DEFRAG_VS_IN;
 719        if (NF_INET_FORWARD == hooknum)
 720                return IP_DEFRAG_VS_FWD;
 721        return IP_DEFRAG_VS_OUT;
 722}
 723
 724static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs,
 725                                     struct sk_buff *skb, u_int32_t user)
 726{
 727        int err;
 728
 729        local_bh_disable();
 730        err = ip_defrag(ipvs->net, skb, user);
 731        local_bh_enable();
 732        if (!err)
 733                ip_send_check(ip_hdr(skb));
 734
 735        return err;
 736}
 737
 738static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
 739                                 struct sk_buff *skb, unsigned int hooknum)
 740{
 741        if (!sysctl_snat_reroute(ipvs))
 742                return 0;
 743        /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */
 744        if (NF_INET_LOCAL_IN == hooknum)
 745                return 0;
 746#ifdef CONFIG_IP_VS_IPV6
 747        if (af == AF_INET6) {
 748                struct dst_entry *dst = skb_dst(skb);
 749
 750                if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
 751                    ip6_route_me_harder(ipvs->net, skb) != 0)
 752                        return 1;
 753        } else
 754#endif
 755                if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
 756                    ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
 757                        return 1;
 758
 759        return 0;
 760}
 761
 762/*
 763 * Packet has been made sufficiently writable in caller
 764 * - inout: 1=in->out, 0=out->in
 765 */
 766void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 767                    struct ip_vs_conn *cp, int inout)
 768{
 769        struct iphdr *iph        = ip_hdr(skb);
 770        unsigned int icmp_offset = iph->ihl*4;
 771        struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
 772                                                      icmp_offset);
 773        struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
 774
 775        if (inout) {
 776                iph->saddr = cp->vaddr.ip;
 777                ip_send_check(iph);
 778                ciph->daddr = cp->vaddr.ip;
 779                ip_send_check(ciph);
 780        } else {
 781                iph->daddr = cp->daddr.ip;
 782                ip_send_check(iph);
 783                ciph->saddr = cp->daddr.ip;
 784                ip_send_check(ciph);
 785        }
 786
 787        /* the TCP/UDP/SCTP port */
 788        if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
 789            IPPROTO_SCTP == ciph->protocol) {
 790                __be16 *ports = (void *)ciph + ciph->ihl*4;
 791
 792                if (inout)
 793                        ports[1] = cp->vport;
 794                else
 795                        ports[0] = cp->dport;
 796        }
 797
 798        /* And finally the ICMP checksum */
 799        icmph->checksum = 0;
 800        icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
 801        skb->ip_summed = CHECKSUM_UNNECESSARY;
 802
 803        if (inout)
 804                IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 805                        "Forwarding altered outgoing ICMP");
 806        else
 807                IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 808                        "Forwarding altered incoming ICMP");
 809}
 810
 811#ifdef CONFIG_IP_VS_IPV6
 812void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 813                    struct ip_vs_conn *cp, int inout)
 814{
 815        struct ipv6hdr *iph      = ipv6_hdr(skb);
 816        unsigned int icmp_offset = 0;
 817        unsigned int offs        = 0; /* header offset*/
 818        int protocol;
 819        struct icmp6hdr *icmph;
 820        struct ipv6hdr *ciph;
 821        unsigned short fragoffs;
 822
 823        ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
 824        icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
 825        offs = icmp_offset + sizeof(struct icmp6hdr);
 826        ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);
 827
 828        protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);
 829
 830        if (inout) {
 831                iph->saddr = cp->vaddr.in6;
 832                ciph->daddr = cp->vaddr.in6;
 833        } else {
 834                iph->daddr = cp->daddr.in6;
 835                ciph->saddr = cp->daddr.in6;
 836        }
 837
 838        /* the TCP/UDP/SCTP port */
 839        if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
 840                          IPPROTO_SCTP == protocol)) {
 841                __be16 *ports = (void *)(skb_network_header(skb) + offs);
 842
 843                IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
 844                              ntohs(inout ? ports[1] : ports[0]),
 845                              ntohs(inout ? cp->vport : cp->dport));
 846                if (inout)
 847                        ports[1] = cp->vport;
 848                else
 849                        ports[0] = cp->dport;
 850        }
 851
 852        /* And finally the ICMP checksum */
 853        icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
 854                                              skb->len - icmp_offset,
 855                                              IPPROTO_ICMPV6, 0);
 856        skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
 857        skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
 858        skb->ip_summed = CHECKSUM_PARTIAL;
 859
 860        if (inout)
 861                IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
 862                              (void *)ciph - (void *)iph,
 863                              "Forwarding altered outgoing ICMPv6");
 864        else
 865                IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
 866                              (void *)ciph - (void *)iph,
 867                              "Forwarding altered incoming ICMPv6");
 868}
 869#endif
 870
 871/* Handle relevant response ICMP messages - forward to the right
 872 * destination host.
 873 */
 874static int handle_response_icmp(int af, struct sk_buff *skb,
 875                                union nf_inet_addr *snet,
 876                                __u8 protocol, struct ip_vs_conn *cp,
 877                                struct ip_vs_protocol *pp,
 878                                unsigned int offset, unsigned int ihl,
 879                                unsigned int hooknum)
 880{
 881        unsigned int verdict = NF_DROP;
 882
 883        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
 884                goto ignore_cp;
 885
 886        /* Ensure the checksum is correct */
 887        if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
 888                /* Failed checksum! */
 889                IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
 890                              IP_VS_DBG_ADDR(af, snet));
 891                goto out;
 892        }
 893
 894        if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
 895            IPPROTO_SCTP == protocol)
 896                offset += 2 * sizeof(__u16);
 897        if (skb_ensure_writable(skb, offset))
 898                goto out;
 899
 900#ifdef CONFIG_IP_VS_IPV6
 901        if (af == AF_INET6)
 902                ip_vs_nat_icmp_v6(skb, pp, cp, 1);
 903        else
 904#endif
 905                ip_vs_nat_icmp(skb, pp, cp, 1);
 906
 907        if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
 908                goto out;
 909
 910        /* do the statistics and put it back */
 911        ip_vs_out_stats(cp, skb);
 912
 913        skb->ipvs_property = 1;
 914        if (!(cp->flags & IP_VS_CONN_F_NFCT))
 915                ip_vs_notrack(skb);
 916        else
 917                ip_vs_update_conntrack(skb, cp, 0);
 918
 919ignore_cp:
 920        verdict = NF_ACCEPT;
 921
 922out:
 923        __ip_vs_conn_put(cp);
 924
 925        return verdict;
 926}
 927
 928/*
 929 *      Handle ICMP messages in the inside-to-outside direction (outgoing).
 930 *      Find any that might be relevant, check against existing connections.
 931 *      Currently handles error types - unreachable, quench, ttl exceeded.
 932 */
 933static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb,
 934                          int *related, unsigned int hooknum)
 935{
 936        struct iphdr *iph;
 937        struct icmphdr  _icmph, *ic;
 938        struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
 939        struct ip_vs_iphdr ciph;
 940        struct ip_vs_conn *cp;
 941        struct ip_vs_protocol *pp;
 942        unsigned int offset, ihl;
 943        union nf_inet_addr snet;
 944
 945        *related = 1;
 946
 947        /* reassemble IP fragments */
 948        if (ip_is_fragment(ip_hdr(skb))) {
 949                if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
 950                        return NF_STOLEN;
 951        }
 952
 953        iph = ip_hdr(skb);
 954        offset = ihl = iph->ihl * 4;
 955        ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 956        if (ic == NULL)
 957                return NF_DROP;
 958
 959        IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
 960                  ic->type, ntohs(icmp_id(ic)),
 961                  &iph->saddr, &iph->daddr);
 962
 963        /*
 964         * Work through seeing if this is for us.
 965         * These checks are supposed to be in an order that means easy
 966         * things are checked first to speed up processing.... however
 967         * this means that some packets will manage to get a long way
 968         * down this stack and then be rejected, but that's life.
 969         */
 970        if ((ic->type != ICMP_DEST_UNREACH) &&
 971            (ic->type != ICMP_SOURCE_QUENCH) &&
 972            (ic->type != ICMP_TIME_EXCEEDED)) {
 973                *related = 0;
 974                return NF_ACCEPT;
 975        }
 976
 977        /* Now find the contained IP header */
 978        offset += sizeof(_icmph);
 979        cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 980        if (cih == NULL)
 981                return NF_ACCEPT; /* The packet looks wrong, ignore */
 982
 983        pp = ip_vs_proto_get(cih->protocol);
 984        if (!pp)
 985                return NF_ACCEPT;
 986
 987        /* Is the embedded protocol header present? */
 988        if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
 989                     pp->dont_defrag))
 990                return NF_ACCEPT;
 991
 992        IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
 993                      "Checking outgoing ICMP for");
 994
 995        ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph);
 996
 997        /* The embedded headers contain source and dest in reverse order */
 998        cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
 999                             ipvs, AF_INET, skb, &ciph);
1000        if (!cp)
1001                return NF_ACCEPT;
1002
1003        snet.ip = iph->saddr;
1004        return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
1005                                    pp, ciph.len, ihl, hooknum);
1006}
1007
1008#ifdef CONFIG_IP_VS_IPV6
1009static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
1010                             int *related,  unsigned int hooknum,
1011                             struct ip_vs_iphdr *ipvsh)
1012{
1013        struct icmp6hdr _icmph, *ic;
1014        struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
1015        struct ip_vs_conn *cp;
1016        struct ip_vs_protocol *pp;
1017        union nf_inet_addr snet;
1018        unsigned int offset;
1019
1020        *related = 1;
1021        ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph);
1022        if (ic == NULL)
1023                return NF_DROP;
1024
1025        /*
1026         * Work through seeing if this is for us.
1027         * These checks are supposed to be in an order that means easy
1028         * things are checked first to speed up processing.... however
1029         * this means that some packets will manage to get a long way
1030         * down this stack and then be rejected, but that's life.
1031         */
1032        if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
1033                *related = 0;
1034                return NF_ACCEPT;
1035        }
1036        /* Fragment header that is before ICMP header tells us that:
1037         * it's not an error message since they can't be fragmented.
1038         */
1039        if (ipvsh->flags & IP6_FH_F_FRAG)
1040                return NF_DROP;
1041
1042        IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
1043                  ic->icmp6_type, ntohs(icmpv6_id(ic)),
1044                  &ipvsh->saddr, &ipvsh->daddr);
1045
1046        if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph),
1047                                     true, &ciph))
1048                return NF_ACCEPT; /* The packet looks wrong, ignore */
1049
1050        pp = ip_vs_proto_get(ciph.protocol);
1051        if (!pp)
1052                return NF_ACCEPT;
1053
1054        /* The embedded headers contain source and dest in reverse order */
1055        cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
1056                             ipvs, AF_INET6, skb, &ciph);
1057        if (!cp)
1058                return NF_ACCEPT;
1059
1060        snet.in6 = ciph.saddr.in6;
1061        offset = ciph.len;
1062        return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
1063                                    pp, offset, sizeof(struct ipv6hdr),
1064                                    hooknum);
1065}
1066#endif
1067
1068/*
1069 * Check if sctp chunc is ABORT chunk
1070 */
1071static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
1072{
1073        struct sctp_chunkhdr *sch, schunk;
1074        sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr),
1075                                 sizeof(schunk), &schunk);
1076        if (sch == NULL)
1077                return 0;
1078        if (sch->type == SCTP_CID_ABORT)
1079                return 1;
1080        return 0;
1081}
1082
1083static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
1084{
1085        struct tcphdr _tcph, *th;
1086
1087        th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
1088        if (th == NULL)
1089                return 0;
1090        return th->rst;
1091}
1092
1093static inline bool is_new_conn(const struct sk_buff *skb,
1094                               struct ip_vs_iphdr *iph)
1095{
1096        switch (iph->protocol) {
1097        case IPPROTO_TCP: {
1098                struct tcphdr _tcph, *th;
1099
1100                th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
1101                if (th == NULL)
1102                        return false;
1103                return th->syn;
1104        }
1105        case IPPROTO_SCTP: {
1106                struct sctp_chunkhdr *sch, schunk;
1107
1108                sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr),
1109                                         sizeof(schunk), &schunk);
1110                if (sch == NULL)
1111                        return false;
1112                return sch->type == SCTP_CID_INIT;
1113        }
1114        default:
1115                return false;
1116        }
1117}
1118
1119static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
1120                                        int conn_reuse_mode)
1121{
1122        /* Controlled (FTP DATA or persistence)? */
1123        if (cp->control)
1124                return false;
1125
1126        switch (cp->protocol) {
1127        case IPPROTO_TCP:
1128                return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
1129                       (cp->state == IP_VS_TCP_S_CLOSE) ||
1130                        ((conn_reuse_mode & 2) &&
1131                         (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
1132                         (cp->flags & IP_VS_CONN_F_NOOUTPUT));
1133        case IPPROTO_SCTP:
1134                return cp->state == IP_VS_SCTP_S_CLOSED;
1135        default:
1136                return false;
1137        }
1138}
1139
1140/* Generic function to create new connections for outgoing RS packets
1141 *
1142 * Pre-requisites for successful connection creation:
1143 * 1) Virtual Service is NOT fwmark based:
1144 *    In fwmark-VS actual vaddr and vport are unknown to IPVS
1145 * 2) Real Server and Virtual Service were NOT configured without port:
1146 *    This is to allow match of different VS to the same RS ip-addr
1147 */
1148struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
1149                                      struct ip_vs_dest *dest,
1150                                      struct sk_buff *skb,
1151                                      const struct ip_vs_iphdr *iph,
1152                                      __be16 dport,
1153                                      __be16 cport)
1154{
1155        struct ip_vs_conn_param param;
1156        struct ip_vs_conn *ct = NULL, *cp = NULL;
1157        const union nf_inet_addr *vaddr, *daddr, *caddr;
1158        union nf_inet_addr snet;
1159        __be16 vport;
1160        unsigned int flags;
1161
1162        EnterFunction(12);
1163        vaddr = &svc->addr;
1164        vport = svc->port;
1165        daddr = &iph->saddr;
1166        caddr = &iph->daddr;
1167
1168        /* check pre-requisites are satisfied */
1169        if (svc->fwmark)
1170                return NULL;
1171        if (!vport || !dport)
1172                return NULL;
1173
1174        /* for persistent service first create connection template */
1175        if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
1176                /* apply netmask the same way ingress-side does */
1177#ifdef CONFIG_IP_VS_IPV6
1178                if (svc->af == AF_INET6)
1179                        ipv6_addr_prefix(&snet.in6, &caddr->in6,
1180                                         (__force __u32)svc->netmask);
1181                else
1182#endif
1183                        snet.ip = caddr->ip & svc->netmask;
1184                /* fill params and create template if not existent */
1185                if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol,
1186                                                  &snet, 0, vaddr,
1187                                                  vport, &param) < 0)
1188                        return NULL;
1189                ct = ip_vs_ct_in_get(&param);
1190                /* check if template exists and points to the same dest */
1191                if (!ct || !ip_vs_check_template(ct, dest)) {
1192                        ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
1193                                            IP_VS_CONN_F_TEMPLATE, dest, 0);
1194                        if (!ct) {
1195                                kfree(param.pe_data);
1196                                return NULL;
1197                        }
1198                        ct->timeout = svc->timeout;
1199                } else {
1200                        kfree(param.pe_data);
1201                }
1202        }
1203
1204        /* connection flags */
1205        flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) &&
1206                 iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0;
1207        /* create connection */
1208        ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
1209                              caddr, cport, vaddr, vport, &param);
1210        cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0);
1211        if (!cp) {
1212                if (ct)
1213                        ip_vs_conn_put(ct);
1214                return NULL;
1215        }
1216        if (ct) {
1217                ip_vs_control_add(cp, ct);
1218                ip_vs_conn_put(ct);
1219        }
1220        ip_vs_conn_stats(cp, svc);
1221
1222        /* return connection (will be used to handle outgoing packet) */
1223        IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u "
1224                      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
1225                      ip_vs_fwd_tag(cp),
1226                      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
1227                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
1228                      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
1229                      cp->flags, refcount_read(&cp->refcnt));
1230        LeaveFunction(12);
1231        return cp;
1232}
1233
1234/* Handle outgoing packets which are considered requests initiated by
1235 * real servers, so that subsequent responses from external client can be
1236 * routed to the right real server.
1237 * Used also for outgoing responses in OPS mode.
1238 *
1239 * Connection management is handled by persistent-engine specific callback.
1240 */
1241static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum,
1242                                              struct netns_ipvs *ipvs,
1243                                              int af, struct sk_buff *skb,
1244                                              const struct ip_vs_iphdr *iph)
1245{
1246        struct ip_vs_dest *dest;
1247        struct ip_vs_conn *cp = NULL;
1248        __be16 _ports[2], *pptr;
1249
1250        if (hooknum == NF_INET_LOCAL_IN)
1251                return NULL;
1252
1253        pptr = frag_safe_skb_hp(skb, iph->len,
1254                                sizeof(_ports), _ports);
1255        if (!pptr)
1256                return NULL;
1257
1258        dest = ip_vs_find_real_service(ipvs, af, iph->protocol,
1259                                       &iph->saddr, pptr[0]);
1260        if (dest) {
1261                struct ip_vs_service *svc;
1262                struct ip_vs_pe *pe;
1263
1264                svc = rcu_dereference(dest->svc);
1265                if (svc) {
1266                        pe = rcu_dereference(svc->pe);
1267                        if (pe && pe->conn_out)
1268                                cp = pe->conn_out(svc, dest, skb, iph,
1269                                                  pptr[0], pptr[1]);
1270                }
1271        }
1272
1273        return cp;
1274}
1275
1276/* Handle response packets: rewrite addresses and send away...
1277 */
1278static unsigned int
1279handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
1280                struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
1281                unsigned int hooknum)
1282{
1283        struct ip_vs_protocol *pp = pd->pp;
1284
1285        IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
1286
1287        if (skb_ensure_writable(skb, iph->len))
1288                goto drop;
1289
1290        /* mangle the packet */
1291        if (pp->snat_handler &&
1292            !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))
1293                goto drop;
1294
1295#ifdef CONFIG_IP_VS_IPV6
1296        if (af == AF_INET6)
1297                ipv6_hdr(skb)->saddr = cp->vaddr.in6;
1298        else
1299#endif
1300        {
1301                ip_hdr(skb)->saddr = cp->vaddr.ip;
1302                ip_send_check(ip_hdr(skb));
1303        }
1304
1305        /*
1306         * nf_iterate does not expect change in the skb->dst->dev.
1307         * It looks like it is not fatal to enable this code for hooks
1308         * where our handlers are at the end of the chain list and
1309         * when all next handlers use skb->dst->dev and not outdev.
1310         * It will definitely route properly the inout NAT traffic
1311         * when multiple paths are used.
1312         */
1313
1314        /* For policy routing, packets originating from this
1315         * machine itself may be routed differently to packets
1316         * passing through.  We want this packet to be routed as
1317         * if it came from this machine itself.  So re-compute
1318         * the routing information.
1319         */
1320        if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
1321                goto drop;
1322
1323        IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
1324
1325        ip_vs_out_stats(cp, skb);
1326        ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
1327        skb->ipvs_property = 1;
1328        if (!(cp->flags & IP_VS_CONN_F_NFCT))
1329                ip_vs_notrack(skb);
1330        else
1331                ip_vs_update_conntrack(skb, cp, 0);
1332        ip_vs_conn_put(cp);
1333
1334        LeaveFunction(11);
1335        return NF_ACCEPT;
1336
1337drop:
1338        ip_vs_conn_put(cp);
1339        kfree_skb(skb);
1340        LeaveFunction(11);
1341        return NF_STOLEN;
1342}
1343
1344/*
1345 *      Check if outgoing packet belongs to the established ip_vs_conn.
1346 */
1347static unsigned int
1348ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
1349{
1350        struct ip_vs_iphdr iph;
1351        struct ip_vs_protocol *pp;
1352        struct ip_vs_proto_data *pd;
1353        struct ip_vs_conn *cp;
1354        struct sock *sk;
1355
1356        EnterFunction(11);
1357
1358        /* Already marked as IPVS request or reply? */
1359        if (skb->ipvs_property)
1360                return NF_ACCEPT;
1361
1362        sk = skb_to_full_sk(skb);
1363        /* Bad... Do not break raw sockets */
1364        if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
1365                     af == AF_INET)) {
1366
1367                if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
1368                        return NF_ACCEPT;
1369        }
1370
1371        if (unlikely(!skb_dst(skb)))
1372                return NF_ACCEPT;
1373
1374        if (!ipvs->enable)
1375                return NF_ACCEPT;
1376
1377        ip_vs_fill_iph_skb(af, skb, false, &iph);
1378#ifdef CONFIG_IP_VS_IPV6
1379        if (af == AF_INET6) {
1380                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1381                        int related;
1382                        int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,
1383                                                        hooknum, &iph);
1384
1385                        if (related)
1386                                return verdict;
1387                }
1388        } else
1389#endif
1390                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1391                        int related;
1392                        int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);
1393
1394                        if (related)
1395                                return verdict;
1396                }
1397
1398        pd = ip_vs_proto_data_get(ipvs, iph.protocol);
1399        if (unlikely(!pd))
1400                return NF_ACCEPT;
1401        pp = pd->pp;
1402
1403        /* reassemble IP fragments */
1404#ifdef CONFIG_IP_VS_IPV6
1405        if (af == AF_INET)
1406#endif
1407                if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
1408                        if (ip_vs_gather_frags(ipvs, skb,
1409                                               ip_vs_defrag_user(hooknum)))
1410                                return NF_STOLEN;
1411
1412                        ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);
1413                }
1414
1415        /*
1416         * Check if the packet belongs to an existing entry
1417         */
1418        cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
1419                             ipvs, af, skb, &iph);
1420
1421        if (likely(cp)) {
1422                if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
1423                        goto ignore_cp;
1424                return handle_response(af, skb, pd, cp, &iph, hooknum);
1425        }
1426
1427        /* Check for real-server-started requests */
1428        if (atomic_read(&ipvs->conn_out_counter)) {
1429                /* Currently only for UDP:
1430                 * connection oriented protocols typically use
1431                 * ephemeral ports for outgoing connections, so
1432                 * related incoming responses would not match any VS
1433                 */
1434                if (pp->protocol == IPPROTO_UDP) {
1435                        cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
1436                        if (likely(cp))
1437                                return handle_response(af, skb, pd, cp, &iph,
1438                                                       hooknum);
1439                }
1440        }
1441
1442        if (sysctl_nat_icmp_send(ipvs) &&
1443            (pp->protocol == IPPROTO_TCP ||
1444             pp->protocol == IPPROTO_UDP ||
1445             pp->protocol == IPPROTO_SCTP)) {
1446                __be16 _ports[2], *pptr;
1447
1448                pptr = frag_safe_skb_hp(skb, iph.len,
1449                                         sizeof(_ports), _ports);
1450                if (pptr == NULL)
1451                        return NF_ACCEPT;       /* Not for me */
1452                if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
1453                                           pptr[0])) {
1454                        /*
1455                         * Notify the real server: there is no
1456                         * existing entry if it is not RST
1457                         * packet or not TCP packet.
1458                         */
1459                        if ((iph.protocol != IPPROTO_TCP &&
1460                             iph.protocol != IPPROTO_SCTP)
1461                             || ((iph.protocol == IPPROTO_TCP
1462                                  && !is_tcp_reset(skb, iph.len))
1463                                 || (iph.protocol == IPPROTO_SCTP
1464                                        && !is_sctp_abort(skb,
1465                                                iph.len)))) {
1466#ifdef CONFIG_IP_VS_IPV6
1467                                if (af == AF_INET6) {
1468                                        if (!skb->dev)
1469                                                skb->dev = ipvs->net->loopback_dev;
1470                                        icmpv6_send(skb,
1471                                                    ICMPV6_DEST_UNREACH,
1472                                                    ICMPV6_PORT_UNREACH,
1473                                                    0);
1474                                } else
1475#endif
1476                                        icmp_send(skb,
1477                                                  ICMP_DEST_UNREACH,
1478                                                  ICMP_PORT_UNREACH, 0);
1479                                return NF_DROP;
1480                        }
1481                }
1482        }
1483
1484out:
1485        IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
1486                      "ip_vs_out: packet continues traversal as normal");
1487        return NF_ACCEPT;
1488
1489ignore_cp:
1490        __ip_vs_conn_put(cp);
1491        goto out;
1492}
1493
1494/*
1495 *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1496 *      used only for VS/NAT.
1497 *      Check if packet is reply for established ip_vs_conn.
1498 */
1499static unsigned int
1500ip_vs_reply4(void *priv, struct sk_buff *skb,
1501             const struct nf_hook_state *state)
1502{
1503        return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
1504}
1505
1506/*
1507 *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1508 *      Check if packet is reply for established ip_vs_conn.
1509 */
1510static unsigned int
1511ip_vs_local_reply4(void *priv, struct sk_buff *skb,
1512                   const struct nf_hook_state *state)
1513{
1514        return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
1515}
1516
1517#ifdef CONFIG_IP_VS_IPV6
1518
1519/*
1520 *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1521 *      used only for VS/NAT.
1522 *      Check if packet is reply for established ip_vs_conn.
1523 */
1524static unsigned int
1525ip_vs_reply6(void *priv, struct sk_buff *skb,
1526             const struct nf_hook_state *state)
1527{
1528        return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
1529}
1530
1531/*
1532 *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1533 *      Check if packet is reply for established ip_vs_conn.
1534 */
1535static unsigned int
1536ip_vs_local_reply6(void *priv, struct sk_buff *skb,
1537                   const struct nf_hook_state *state)
1538{
1539        return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
1540}
1541
1542#endif
1543
1544static unsigned int
1545ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
1546                      struct ip_vs_proto_data *pd,
1547                      int *verdict, struct ip_vs_conn **cpp,
1548                      struct ip_vs_iphdr *iph)
1549{
1550        struct ip_vs_protocol *pp = pd->pp;
1551
1552        if (!iph->fragoffs) {
1553                /* No (second) fragments need to enter here, as nf_defrag_ipv6
1554                 * replayed fragment zero will already have created the cp
1555                 */
1556
1557                /* Schedule and create new connection entry into cpp */
1558                if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph))
1559                        return 0;
1560        }
1561
1562        if (unlikely(!*cpp)) {
1563                /* sorry, all this trouble for a no-hit :) */
1564                IP_VS_DBG_PKT(12, af, pp, skb, iph->off,
1565                              "ip_vs_in: packet continues traversal as normal");
1566
1567                /* Fragment couldn't be mapped to a conn entry */
1568                if (iph->fragoffs)
1569                        IP_VS_DBG_PKT(7, af, pp, skb, iph->off,
1570                                      "unhandled fragment");
1571
1572                *verdict = NF_ACCEPT;
1573                return 0;
1574        }
1575
1576        return 1;
1577}
1578
1579/* Check the UDP tunnel and return its header length */
1580static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
1581                          unsigned int offset, __u16 af,
1582                          const union nf_inet_addr *daddr, __u8 *proto)
1583{
1584        struct udphdr _udph, *udph;
1585        struct ip_vs_dest *dest;
1586
1587        udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
1588        if (!udph)
1589                goto unk;
1590        offset += sizeof(struct udphdr);
1591        dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest);
1592        if (!dest)
1593                goto unk;
1594        if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1595                struct guehdr _gueh, *gueh;
1596
1597                gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh);
1598                if (!gueh)
1599                        goto unk;
1600                if (gueh->control != 0 || gueh->version != 0)
1601                        goto unk;
1602                /* Later we can support also IPPROTO_IPV6 */
1603                if (gueh->proto_ctype != IPPROTO_IPIP)
1604                        goto unk;
1605                *proto = gueh->proto_ctype;
1606                return sizeof(struct udphdr) + sizeof(struct guehdr) +
1607                       (gueh->hlen << 2);
1608        }
1609
1610unk:
1611        return 0;
1612}
1613
1614/* Check the GRE tunnel and return its header length */
1615static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
1616                          unsigned int offset, __u16 af,
1617                          const union nf_inet_addr *daddr, __u8 *proto)
1618{
1619        struct gre_base_hdr _greh, *greh;
1620        struct ip_vs_dest *dest;
1621
1622        greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh);
1623        if (!greh)
1624                goto unk;
1625        dest = ip_vs_find_tunnel(ipvs, af, daddr, 0);
1626        if (!dest)
1627                goto unk;
1628        if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1629                __be16 type;
1630
1631                /* Only support version 0 and C (csum) */
1632                if ((greh->flags & ~GRE_CSUM) != 0)
1633                        goto unk;
1634                type = greh->protocol;
1635                /* Later we can support also IPPROTO_IPV6 */
1636                if (type != htons(ETH_P_IP))
1637                        goto unk;
1638                *proto = IPPROTO_IPIP;
1639                return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags));
1640        }
1641
1642unk:
1643        return 0;
1644}
1645
/*
 *      Handle ICMP messages in the outside-to-inside direction (incoming).
 *      Find any that might be relevant, check against existing connections,
 *      forward to the right destination host if relevant.
 *      Currently handles error types - unreachable, quench, ttl exceeded.
 *
 *      @ipvs:    per-netns IPVS state
 *      @skb:     the ICMP packet; may be consumed (NF_STOLEN is returned then)
 *      @related: set to 1 on entry and reset to 0 only when the ICMP type is
 *                not one of the handled error types
 *      @hooknum: netfilter hook the packet was seen at
 *
 *      Returns a netfilter verdict (NF_ACCEPT, NF_DROP, NF_STOLEN or whatever
 *      the scheduling / transmit helpers return).
 */
static int
ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
              unsigned int hooknum)
{
        struct iphdr *iph;
        struct icmphdr  _icmph, *ic;
        struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
        struct ip_vs_iphdr ciph;
        struct ip_vs_conn *cp;
        struct ip_vs_protocol *pp;
        struct ip_vs_proto_data *pd;
        unsigned int offset, offset2, ihl, verdict;
        bool ipip, new_cp = false;
        union nf_inet_addr *raddr;

        *related = 1;

        /* reassemble IP fragments */
        if (ip_is_fragment(ip_hdr(skb))) {
                if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }

        iph = ip_hdr(skb);
        /* ihl keeps the outer header length; offset walks through the packet */
        offset = ihl = iph->ihl * 4;
        ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
        if (ic == NULL)
                return NF_DROP;

        IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
                  ic->type, ntohs(icmp_id(ic)),
                  &iph->saddr, &iph->daddr);

        /*
         * Work through seeing if this is for us.
         * These checks are supposed to be in an order that means easy
         * things are checked first to speed up processing.... however
         * this means that some packets will manage to get a long way
         * down this stack and then be rejected, but that's life.
         */
        if ((ic->type != ICMP_DEST_UNREACH) &&
            (ic->type != ICMP_SOURCE_QUENCH) &&
            (ic->type != ICMP_TIME_EXCEEDED)) {
                *related = 0;
                return NF_ACCEPT;
        }

        /* Now find the contained IP header */
        offset += sizeof(_icmph);
        cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
        if (cih == NULL)
                return NF_ACCEPT; /* The packet looks wrong, ignore */
        /* Destination of the embedded packet, used for the tunnel lookups */
        raddr = (union nf_inet_addr *)&cih->daddr;

        /* Special case for errors for IPIP packets */
        ipip = false;
        if (cih->protocol == IPPROTO_IPIP) {
                struct ip_vs_dest *dest;

                if (unlikely(cih->frag_off & htons(IP_OFFSET)))
                        return NF_ACCEPT;
                /* Error for our IPIP must arrive at LOCAL_IN */
                if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
                        return NF_ACCEPT;
                dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0);
                /* Only for known tunnel */
                if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
                        return NF_ACCEPT;
                /* Step over the tunnel IP header to the original IP header */
                offset += cih->ihl * 4;
                cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
                if (cih == NULL)
                        return NF_ACCEPT; /* The packet looks wrong, ignore */
                ipip = true;
        } else if ((cih->protocol == IPPROTO_UDP ||     /* Can be UDP encap */
                    cih->protocol == IPPROTO_GRE) &&    /* Can be GRE encap */
                   /* Error for our tunnel must arrive at LOCAL_IN */
                   (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) {
                __u8 iproto;
                int ulen;

                /* Non-first fragment has no UDP header */
                if (unlikely(cih->frag_off & htons(IP_OFFSET)))
                        return NF_ACCEPT;
                offset2 = offset + cih->ihl * 4;
                if (cih->protocol == IPPROTO_UDP)
                        ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET,
                                              raddr, &iproto);
                else
                        ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET,
                                              raddr, &iproto);
                /* ulen == 0: not one of our tunnels, treat as plain UDP/GRE */
                if (ulen > 0) {
                        /* Skip IP and UDP/GRE tunnel headers */
                        offset = offset2 + ulen;
                        /* Now we should be at the original IP header */
                        cih = skb_header_pointer(skb, offset, sizeof(_ciph),
                                                 &_ciph);
                        if (cih && cih->version == 4 && cih->ihl >= 5 &&
                            iproto == IPPROTO_IPIP)
                                ipip = true;
                        else
                                return NF_ACCEPT;
                }
        }

        pd = ip_vs_proto_data_get(ipvs, cih->protocol);
        if (!pd)
                return NF_ACCEPT;
        pp = pd->pp;

        /* Is the embedded protocol header present? */
        if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
                     pp->dont_defrag))
                return NF_ACCEPT;

        IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
                      "Checking incoming ICMP for");

        /* offset2 remembers where the original IP header starts */
        offset2 = offset;
        ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph);
        offset = ciph.len;

        /* The embedded headers contain source and dest in reverse order.
         * For IPIP this is error for request, not for reply.
         */
        cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
                             ipvs, AF_INET, skb, &ciph);

        if (!cp) {
                int v;

                /* No scheduling for tunnel errors or when schedule_icmp is off */
                if (ipip || !sysctl_schedule_icmp(ipvs))
                        return NF_ACCEPT;

                if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
                        return v;
                /* Remember that cp was created here; it is released differently */
                new_cp = true;
        }

        verdict = NF_DROP;

        /* Ensure the checksum is correct */
        if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
                /* Failed checksum! */
                IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
                          &iph->saddr);
                goto out;
        }

        if (ipip) {
                /* Saved before the skb is pulled below */
                __be32 info = ic->un.gateway;
                __u8 type = ic->type;
                __u8 code = ic->code;

                /* Update the MTU */
                if (ic->type == ICMP_DEST_UNREACH &&
                    ic->code == ICMP_FRAG_NEEDED) {
                        struct ip_vs_dest *dest = cp->dest;
                        u32 mtu = ntohs(ic->un.frag.mtu);
                        __be16 frag_off = cih->frag_off;

                        /* Strip outer IP and ICMP, go to IPIP header */
                        if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
                                goto ignore_ipip;
                        /* Keep offset2 relative to the new start of data */
                        offset2 -= ihl + sizeof(_icmph);
                        skb_reset_network_header(skb);
                        IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
                                &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
                        ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
                        /* Client uses PMTUD? */
                        if (!(frag_off & htons(IP_DF)))
                                goto ignore_ipip;
                        /* Prefer the resulting PMTU */
                        if (dest) {
                                struct ip_vs_dest_dst *dest_dst;

                                dest_dst = rcu_dereference(dest->dest_dst);
                                if (dest_dst)
                                        mtu = dst_mtu(dest_dst->dst_cache);
                        }
                        /* Report the MTU without the tunnel IP header */
                        if (mtu > 68 + sizeof(struct iphdr))
                                mtu -= sizeof(struct iphdr);
                        info = htonl(mtu);
                }
                /* Strip outer IP, ICMP and IPIP, go to IP header of
                 * original request.
                 */
                if (pskb_pull(skb, offset2) == NULL)
                        goto ignore_ipip;
                skb_reset_network_header(skb);
                IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
                        &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                        type, code, ntohl(info));
                icmp_send(skb, type, code, info);
                /* ICMP can be shorter but anyways, account it */
                ip_vs_out_stats(cp, skb);

ignore_ipip:
                /* The skb is released here, so the caller must not touch it */
                consume_skb(skb);
                verdict = NF_STOLEN;
                goto out;
        }

        /* do the statistics and put it back */
        ip_vs_in_stats(cp, skb);
        /* For TCP/UDP/SCTP the two 16-bit ports are covered as well */
        if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
            IPPROTO_SCTP == cih->protocol)
                offset += 2 * sizeof(__u16);
        verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);

out:
        /* A connection found by lookup is released with __ip_vs_conn_put(),
         * one created above by scheduling with ip_vs_conn_put().
         */
        if (likely(!new_cp))
                __ip_vs_conn_put(cp);
        else
                ip_vs_conn_put(cp);

        return verdict;
}
1869
1870#ifdef CONFIG_IP_VS_IPV6
1871static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
1872                            int *related, unsigned int hooknum,
1873                            struct ip_vs_iphdr *iph)
1874{
1875        struct icmp6hdr _icmph, *ic;
1876        struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
1877        struct ip_vs_conn *cp;
1878        struct ip_vs_protocol *pp;
1879        struct ip_vs_proto_data *pd;
1880        unsigned int offset, verdict;
1881        bool new_cp = false;
1882
1883        *related = 1;
1884
1885        ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph);
1886        if (ic == NULL)
1887                return NF_DROP;
1888
1889        /*
1890         * Work through seeing if this is for us.
1891         * These checks are supposed to be in an order that means easy
1892         * things are checked first to speed up processing.... however
1893         * this means that some packets will manage to get a long way
1894         * down this stack and then be rejected, but that's life.
1895         */
1896        if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
1897                *related = 0;
1898                return NF_ACCEPT;
1899        }
1900        /* Fragment header that is before ICMP header tells us that:
1901         * it's not an error message since they can't be fragmented.
1902         */
1903        if (iph->flags & IP6_FH_F_FRAG)
1904                return NF_DROP;
1905
1906        IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
1907                  ic->icmp6_type, ntohs(icmpv6_id(ic)),
1908                  &iph->saddr, &iph->daddr);
1909
1910        offset = iph->len + sizeof(_icmph);
1911        if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph))
1912                return NF_ACCEPT;
1913
1914        pd = ip_vs_proto_data_get(ipvs, ciph.protocol);
1915        if (!pd)
1916                return NF_ACCEPT;
1917        pp = pd->pp;
1918
1919        /* Cannot handle fragmented embedded protocol */
1920        if (ciph.fragoffs)
1921                return NF_ACCEPT;
1922
1923        IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
1924                      "Checking incoming ICMPv6 for");
1925
1926        /* The embedded headers contain source and dest in reverse order
1927         * if not from localhost
1928         */
1929        cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
1930                             ipvs, AF_INET6, skb, &ciph);
1931
1932        if (!cp) {
1933                int v;
1934
1935                if (!sysctl_schedule_icmp(ipvs))
1936                        return NF_ACCEPT;
1937
1938                if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph))
1939                        return v;
1940
1941                new_cp = true;
1942        }
1943
1944        /* VS/TUN, VS/DR and LOCALNODE just let it go */
1945        if ((hooknum == NF_INET_LOCAL_OUT) &&
1946            (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
1947                verdict = NF_ACCEPT;
1948                goto out;
1949        }
1950
1951        /* do the statistics and put it back */
1952        ip_vs_in_stats(cp, skb);
1953
1954        /* Need to mangle contained IPv6 header in ICMPv6 packet */
1955        offset = ciph.len;
1956        if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
1957            IPPROTO_SCTP == ciph.protocol)
1958                offset += 2 * sizeof(__u16); /* Also mangle ports */
1959
1960        verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph);
1961
1962out:
1963        if (likely(!new_cp))
1964                __ip_vs_conn_put(cp);
1965        else
1966                ip_vs_conn_put(cp);
1967
1968        return verdict;
1969}
1970#endif
1971
1972
/*
 *      Check if it's for virtual services, look it up,
 *      and send it on its way...
 *
 *      @ipvs:    per-netns IPVS state
 *      @hooknum: netfilter hook the packet was seen at
 *      @skb:     the packet
 *      @af:      AF_INET or AF_INET6
 *
 *      Common body of the LOCAL_IN / LOCAL_OUT request hooks.  ICMP errors
 *      are handed to ip_vs_in_icmp()/ip_vs_in_icmp_v6(); other packets are
 *      matched against an existing connection or scheduled to a new one and
 *      then passed to the connection's packet_xmit handler.
 *
 *      Returns a netfilter verdict.
 */
static unsigned int
ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
        struct ip_vs_iphdr iph;
        struct ip_vs_protocol *pp;
        struct ip_vs_proto_data *pd;
        struct ip_vs_conn *cp;
        int ret, pkts;
        int conn_reuse_mode;
        struct sock *sk;

        /* Already marked as IPVS request or reply? */
        if (skb->ipvs_property)
                return NF_ACCEPT;

        /*
         *      Early filter:
         *      - remote client: only PACKET_HOST
         *      - route: used for struct net when skb->dev is unset
         */
        if (unlikely((skb->pkt_type != PACKET_HOST &&
                      hooknum != NF_INET_LOCAL_OUT) ||
                     !skb_dst(skb))) {
                /* Header is parsed here only for the debug message below */
                ip_vs_fill_iph_skb(af, skb, false, &iph);
                IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
                              " ignored in hook %u\n",
                              skb->pkt_type, iph.protocol,
                              IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
                return NF_ACCEPT;
        }
        /* ipvs enabled in this netns ? */
        if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
                return NF_ACCEPT;

        ip_vs_fill_iph_skb(af, skb, false, &iph);

        /* Bad... Do not break raw sockets */
        sk = skb_to_full_sk(skb);
        if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
                     af == AF_INET)) {

                /* Locally generated IPv4 from a socket with nodefrag set */
                if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
                        return NF_ACCEPT;
        }

        /* ICMP errors: if the handler says the message is related, its
         * verdict is final; otherwise fall through to normal processing.
         */
#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
                        int related;
                        int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related,
                                                       hooknum, &iph);

                        if (related)
                                return verdict;
                }
        } else
#endif
                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
                        int related;
                        int verdict = ip_vs_in_icmp(ipvs, skb, &related,
                                                    hooknum);

                        if (related)
                                return verdict;
                }

        /* Protocol supported? */
        pd = ip_vs_proto_data_get(ipvs, iph.protocol);
        if (unlikely(!pd)) {
                /* The only way we'll see this packet again is if it's
                 * encapsulated, so mark it with ipvs_property=1 so we
                 * skip it if we're ignoring tunneled packets
                 */
                if (sysctl_ignore_tunneled(ipvs))
                        skb->ipvs_property = 1;

                return NF_ACCEPT;
        }
        pp = pd->pp;
        /*
         * Check if the packet belongs to an existing connection entry
         */
        cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
                             ipvs, af, skb, &iph);

        /* Connection reuse: a packet that starts a new connection matched
         * an existing entry; decide whether to expire the entry and
         * schedule the packet again.
         */
        conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
        if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
                bool uses_ct = false, resched = false;

                if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
                    unlikely(!atomic_read(&cp->dest->weight))) {
                        /* Destination has weight 0 and expire_nodest_conn is set */
                        resched = true;
                        uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
                } else if (is_new_conn_expected(cp, conn_reuse_mode)) {
                        uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
                        if (!atomic_read(&cp->n_control)) {
                                resched = true;
                        } else {
                                /* Do not reschedule controlling connection
                                 * that uses conntrack while it is still
                                 * referenced by controlled connection(s).
                                 */
                                resched = !uses_ct;
                        }
                }

                if (resched) {
                        if (!atomic_read(&cp->n_control))
                                ip_vs_conn_expire_now(cp);
                        __ip_vs_conn_put(cp);
                        /* With conntrack in use the packet is dropped
                         * instead of being rescheduled right away.
                         */
                        if (uses_ct)
                                return NF_DROP;
                        cp = NULL;
                }
        }

        if (unlikely(!cp)) {
                int v;

                /* On failure v carries the verdict chosen by the scheduler */
                if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
                        return v;
        }

        IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");

        /* Check the server status */
        if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
                /* the destination server is not available */

                /* NOTE(review): flags is copied before the put, presumably
                 * because cp must not be read for it afterwards - confirm.
                 */
                __u32 flags = cp->flags;

                /* when timer already started, silently drop the packet.*/
                if (timer_pending(&cp->timer))
                        __ip_vs_conn_put(cp);
                else
                        ip_vs_conn_put(cp);

                if (sysctl_expire_nodest_conn(ipvs) &&
                    !(flags & IP_VS_CONN_F_ONE_PACKET)) {
                        /* try to expire the connection immediately */
                        ip_vs_conn_expire_now(cp);
                }

                return NF_DROP;
        }

        ip_vs_in_stats(cp, skb);
        ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
        if (cp->packet_xmit)
                ret = cp->packet_xmit(skb, cp, pp, &iph);
                /* do not touch skb anymore */
        else {
                IP_VS_DBG_RL("warning: packet_xmit is null");
                ret = NF_ACCEPT;
        }

        /* Increase its packet counter and check if it is needed
         * to be synchronized
         *
         * Sync connection if it is about to close to
         * encourage the standby servers to update the connections timeout
         *
         * For ONE_PKT let ip_vs_sync_conn() do the filter work.
         */

        if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
                pkts = sysctl_sync_threshold(ipvs);
        else
                pkts = atomic_add_return(1, &cp->in_pkts);

        if (ipvs->sync_state & IP_VS_STATE_MASTER)
                ip_vs_sync_conn(ipvs, cp, pkts);
        else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
                /* increment is done inside ip_vs_sync_conn too */
                atomic_inc(&cp->control->in_pkts);

        ip_vs_conn_put(cp);
        return ret;
}
2156
2157/*
2158 *      AF_INET handler in NF_INET_LOCAL_IN chain
2159 *      Schedule and forward packets from remote clients
2160 */
2161static unsigned int
2162ip_vs_remote_request4(void *priv, struct sk_buff *skb,
2163                      const struct nf_hook_state *state)
2164{
2165        return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
2166}
2167
2168/*
2169 *      AF_INET handler in NF_INET_LOCAL_OUT chain
2170 *      Schedule and forward packets from local clients
2171 */
2172static unsigned int
2173ip_vs_local_request4(void *priv, struct sk_buff *skb,
2174                     const struct nf_hook_state *state)
2175{
2176        return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
2177}
2178
2179#ifdef CONFIG_IP_VS_IPV6
2180
2181/*
2182 *      AF_INET6 handler in NF_INET_LOCAL_IN chain
2183 *      Schedule and forward packets from remote clients
2184 */
2185static unsigned int
2186ip_vs_remote_request6(void *priv, struct sk_buff *skb,
2187                      const struct nf_hook_state *state)
2188{
2189        return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
2190}
2191
2192/*
2193 *      AF_INET6 handler in NF_INET_LOCAL_OUT chain
2194 *      Schedule and forward packets from local clients
2195 */
2196static unsigned int
2197ip_vs_local_request6(void *priv, struct sk_buff *skb,
2198                     const struct nf_hook_state *state)
2199{
2200        return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
2201}
2202
2203#endif
2204
2205
2206/*
2207 *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
2208 *      related packets destined for 0.0.0.0/0.
2209 *      When fwmark-based virtual service is used, such as transparent
2210 *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
2211 *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
2212 *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
2213 *      and send them to ip_vs_in_icmp.
2214 */
2215static unsigned int
2216ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
2217                   const struct nf_hook_state *state)
2218{
2219        int r;
2220        struct netns_ipvs *ipvs = net_ipvs(state->net);
2221
2222        if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
2223                return NF_ACCEPT;
2224
2225        /* ipvs enabled in this netns ? */
2226        if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
2227                return NF_ACCEPT;
2228
2229        return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
2230}
2231
2232#ifdef CONFIG_IP_VS_IPV6
2233static unsigned int
2234ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb,
2235                      const struct nf_hook_state *state)
2236{
2237        int r;
2238        struct netns_ipvs *ipvs = net_ipvs(state->net);
2239        struct ip_vs_iphdr iphdr;
2240
2241        ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
2242        if (iphdr.protocol != IPPROTO_ICMPV6)
2243                return NF_ACCEPT;
2244
2245        /* ipvs enabled in this netns ? */
2246        if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
2247                return NF_ACCEPT;
2248
2249        return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
2250}
2251#endif
2252
2253
/*
 *      Netfilter hooks used by IPVS.  The whole table is registered for a
 *      netns by __ip_vs_dev_init() and unregistered by
 *      __ip_vs_dev_cleanup().  The IPv6 entries exist only with
 *      CONFIG_IP_VS_IPV6.  On each hook point the reply handler is given
 *      a smaller priority value than the request handler.
 */
static const struct nf_hook_ops ip_vs_ops[] = {
        /* After packet filtering, change source only for VS/NAT */
        {
                .hook           = ip_vs_reply4,
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority       = NF_IP_PRI_NAT_SRC - 2,
        },
        /* After packet filtering, forward packet through VS/DR, VS/TUN,
         * or VS/NAT(change destination), so that filtering rules can be
         * applied to IPVS. */
        {
                .hook           = ip_vs_remote_request4,
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority       = NF_IP_PRI_NAT_SRC - 1,
        },
        /* Before ip_vs_in, change source only for VS/NAT */
        {
                .hook           = ip_vs_local_reply4,
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = NF_IP_PRI_NAT_DST + 1,
        },
        /* After mangle, schedule and forward local requests */
        {
                .hook           = ip_vs_local_request4,
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = NF_IP_PRI_NAT_DST + 2,
        },
        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
        {
                .hook           = ip_vs_forward_icmp,
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_FORWARD,
                .priority       = 99,
        },
        /* After packet filtering, change source only for VS/NAT */
        {
                .hook           = ip_vs_reply4,
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_FORWARD,
                .priority       = 100,
        },
#ifdef CONFIG_IP_VS_IPV6
        /* After packet filtering, change source only for VS/NAT */
        {
                .hook           = ip_vs_reply6,
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority       = NF_IP6_PRI_NAT_SRC - 2,
        },
        /* After packet filtering, forward packet through VS/DR, VS/TUN,
         * or VS/NAT(change destination), so that filtering rules can be
         * applied to IPVS. */
        {
                .hook           = ip_vs_remote_request6,
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority       = NF_IP6_PRI_NAT_SRC - 1,
        },
        /* Before ip_vs_in, change source only for VS/NAT */
        {
                .hook           = ip_vs_local_reply6,
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = NF_IP6_PRI_NAT_DST + 1,
        },
        /* After mangle, schedule and forward local requests */
        {
                .hook           = ip_vs_local_request6,
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = NF_IP6_PRI_NAT_DST + 2,
        },
        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
        {
                .hook           = ip_vs_forward_icmp_v6,
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_FORWARD,
                .priority       = 99,
        },
        /* After packet filtering, change source only for VS/NAT */
        {
                .hook           = ip_vs_reply6,
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_FORWARD,
                .priority       = 100,
        },
#endif
};
2348/*
2349 *      Initialize IP Virtual Server netns mem.
2350 */
2351static int __net_init __ip_vs_init(struct net *net)
2352{
2353        struct netns_ipvs *ipvs;
2354
2355        ipvs = net_generic(net, ip_vs_net_id);
2356        if (ipvs == NULL)
2357                return -ENOMEM;
2358
2359        /* Hold the beast until a service is registerd */
2360        ipvs->enable = 0;
2361        ipvs->net = net;
2362        /* Counters used for creating unique names */
2363        ipvs->gen = atomic_read(&ipvs_netns_cnt);
2364        atomic_inc(&ipvs_netns_cnt);
2365        net->ipvs = ipvs;
2366
2367        if (ip_vs_estimator_net_init(ipvs) < 0)
2368                goto estimator_fail;
2369
2370        if (ip_vs_control_net_init(ipvs) < 0)
2371                goto control_fail;
2372
2373        if (ip_vs_protocol_net_init(ipvs) < 0)
2374                goto protocol_fail;
2375
2376        if (ip_vs_app_net_init(ipvs) < 0)
2377                goto app_fail;
2378
2379        if (ip_vs_conn_net_init(ipvs) < 0)
2380                goto conn_fail;
2381
2382        if (ip_vs_sync_net_init(ipvs) < 0)
2383                goto sync_fail;
2384
2385        return 0;
2386/*
2387 * Error handling
2388 */
2389
2390sync_fail:
2391        ip_vs_conn_net_cleanup(ipvs);
2392conn_fail:
2393        ip_vs_app_net_cleanup(ipvs);
2394app_fail:
2395        ip_vs_protocol_net_cleanup(ipvs);
2396protocol_fail:
2397        ip_vs_control_net_cleanup(ipvs);
2398control_fail:
2399        ip_vs_estimator_net_cleanup(ipvs);
2400estimator_fail:
2401        net->ipvs = NULL;
2402        return -ENOMEM;
2403}
2404
2405static void __net_exit __ip_vs_cleanup(struct net *net)
2406{
2407        struct netns_ipvs *ipvs = net_ipvs(net);
2408
2409        ip_vs_service_net_cleanup(ipvs);        /* ip_vs_flush() with locks */
2410        ip_vs_conn_net_cleanup(ipvs);
2411        ip_vs_app_net_cleanup(ipvs);
2412        ip_vs_protocol_net_cleanup(ipvs);
2413        ip_vs_control_net_cleanup(ipvs);
2414        ip_vs_estimator_net_cleanup(ipvs);
2415        IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
2416        net->ipvs = NULL;
2417}
2418
2419static int __net_init __ip_vs_dev_init(struct net *net)
2420{
2421        int ret;
2422
2423        ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
2424        if (ret < 0)
2425                goto hook_fail;
2426        return 0;
2427
2428hook_fail:
2429        return ret;
2430}
2431
2432static void __net_exit __ip_vs_dev_cleanup(struct net *net)
2433{
2434        struct netns_ipvs *ipvs = net_ipvs(net);
2435        EnterFunction(2);
2436        nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
2437        ipvs->enable = 0;       /* Disable packet reception */
2438        smp_wmb();
2439        ip_vs_sync_net_cleanup(ipvs);
2440        LeaveFunction(2);
2441}
2442
2443static struct pernet_operations ipvs_core_ops = {
2444        .init = __ip_vs_init,
2445        .exit = __ip_vs_cleanup,
2446        .id   = &ip_vs_net_id,
2447        .size = sizeof(struct netns_ipvs),
2448};
2449
2450static struct pernet_operations ipvs_core_dev_ops = {
2451        .init = __ip_vs_dev_init,
2452        .exit = __ip_vs_dev_cleanup,
2453};
2454
2455/*
2456 *      Initialize IP Virtual Server
2457 */
2458static int __init ip_vs_init(void)
2459{
2460        int ret;
2461
2462        ret = ip_vs_control_init();
2463        if (ret < 0) {
2464                pr_err("can't setup control.\n");
2465                goto exit;
2466        }
2467
2468        ip_vs_protocol_init();
2469
2470        ret = ip_vs_conn_init();
2471        if (ret < 0) {
2472                pr_err("can't setup connection table.\n");
2473                goto cleanup_protocol;
2474        }
2475
2476        ret = register_pernet_subsys(&ipvs_core_ops);   /* Alloc ip_vs struct */
2477        if (ret < 0)
2478                goto cleanup_conn;
2479
2480        ret = register_pernet_device(&ipvs_core_dev_ops);
2481        if (ret < 0)
2482                goto cleanup_sub;
2483
2484        ret = ip_vs_register_nl_ioctl();
2485        if (ret < 0) {
2486                pr_err("can't register netlink/ioctl.\n");
2487                goto cleanup_dev;
2488        }
2489
2490        pr_info("ipvs loaded.\n");
2491
2492        return ret;
2493
2494cleanup_dev:
2495        unregister_pernet_device(&ipvs_core_dev_ops);
2496cleanup_sub:
2497        unregister_pernet_subsys(&ipvs_core_ops);
2498cleanup_conn:
2499        ip_vs_conn_cleanup();
2500cleanup_protocol:
2501        ip_vs_protocol_cleanup();
2502        ip_vs_control_cleanup();
2503exit:
2504        return ret;
2505}
2506
2507static void __exit ip_vs_cleanup(void)
2508{
2509        ip_vs_unregister_nl_ioctl();
2510        unregister_pernet_device(&ipvs_core_dev_ops);
2511        unregister_pernet_subsys(&ipvs_core_ops);       /* free ip_vs struct */
2512        ip_vs_conn_cleanup();
2513        ip_vs_protocol_cleanup();
2514        ip_vs_control_cleanup();
2515        pr_info("ipvs unloaded.\n");
2516}
2517
2518module_init(ip_vs_init);
2519module_exit(ip_vs_cleanup);
2520MODULE_LICENSE("GPL");
2521