linux/net/netfilter/ipvs/ip_vs_proto_tcp.c
<<
>>
Prefs
   1/*
   2 * ip_vs_proto_tcp.c:   TCP load balancing support for IPVS
   3 *
   4 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   5 *              Julian Anastasov <ja@ssi.bg>
   6 *
   7 *              This program is free software; you can redistribute it and/or
   8 *              modify it under the terms of the GNU General Public License
   9 *              as published by the Free Software Foundation; either version
  10 *              2 of the License, or (at your option) any later version.
  11 *
  12 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
  13 *
   14 *              Network namespace (netns) aware.
   15 *              Global data moved to netns, i.e. struct netns_ipvs.
   16 *              The tcp_timeouts table has a copy per netns in a hash table
   17 *              per protocol (ip_vs_proto_data) and is handled by netns.
  18 */
  19
  20#define KMSG_COMPONENT "IPVS"
  21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  22
  23#include <linux/kernel.h>
  24#include <linux/ip.h>
  25#include <linux/tcp.h>                  /* for tcphdr */
  26#include <net/ip.h>
  27#include <net/tcp.h>                    /* for csum_tcpudp_magic */
  28#include <net/ip6_checksum.h>
  29#include <linux/netfilter.h>
  30#include <linux/netfilter_ipv4.h>
  31
  32#include <net/ip_vs.h>
  33
  34static int
  35tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
  36                  int *verdict, struct ip_vs_conn **cpp,
  37                  struct ip_vs_iphdr *iph)
  38{
  39        struct net *net;
  40        struct ip_vs_service *svc;
  41        struct tcphdr _tcph, *th;
  42        struct netns_ipvs *ipvs;
  43
  44        th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
  45        if (th == NULL) {
  46                *verdict = NF_DROP;
  47                return 0;
  48        }
  49        net = skb_net(skb);
  50        ipvs = net_ipvs(net);
  51        /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
  52        rcu_read_lock();
  53        if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
  54            (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
  55                                      &iph->daddr, th->dest))) {
  56                int ignored;
  57
  58                if (ip_vs_todrop(ipvs)) {
  59                        /*
  60                         * It seems that we are very loaded.
  61                         * We have to drop this packet :(
  62                         */
  63                        rcu_read_unlock();
  64                        *verdict = NF_DROP;
  65                        return 0;
  66                }
  67
  68                /*
  69                 * Let the virtual server select a real server for the
  70                 * incoming connection, and create a connection entry.
  71                 */
  72                *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
  73                if (!*cpp && ignored <= 0) {
  74                        if (!ignored)
  75                                *verdict = ip_vs_leave(svc, skb, pd, iph);
  76                        else
  77                                *verdict = NF_DROP;
  78                        rcu_read_unlock();
  79                        return 0;
  80                }
  81        }
  82        rcu_read_unlock();
  83        /* NF_ACCEPT */
  84        return 1;
  85}
  86
  87
  88static inline void
  89tcp_fast_csum_update(int af, struct tcphdr *tcph,
  90                     const union nf_inet_addr *oldip,
  91                     const union nf_inet_addr *newip,
  92                     __be16 oldport, __be16 newport)
  93{
  94#ifdef CONFIG_IP_VS_IPV6
  95        if (af == AF_INET6)
  96                tcph->check =
  97                        csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
  98                                         ip_vs_check_diff2(oldport, newport,
  99                                                ~csum_unfold(tcph->check))));
 100        else
 101#endif
 102        tcph->check =
 103                csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 104                                 ip_vs_check_diff2(oldport, newport,
 105                                                ~csum_unfold(tcph->check))));
 106}
 107
 108
 109static inline void
 110tcp_partial_csum_update(int af, struct tcphdr *tcph,
 111                     const union nf_inet_addr *oldip,
 112                     const union nf_inet_addr *newip,
 113                     __be16 oldlen, __be16 newlen)
 114{
 115#ifdef CONFIG_IP_VS_IPV6
 116        if (af == AF_INET6)
 117                tcph->check =
 118                        ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 119                                         ip_vs_check_diff2(oldlen, newlen,
 120                                                csum_unfold(tcph->check))));
 121        else
 122#endif
 123        tcph->check =
 124                ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 125                                ip_vs_check_diff2(oldlen, newlen,
 126                                                csum_unfold(tcph->check))));
 127}
 128
 129
 130static int
 131tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 132                 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
 133{
 134        struct tcphdr *tcph;
 135        unsigned int tcphoff = iph->len;
 136        int oldlen;
 137        int payload_csum = 0;
 138
 139#ifdef CONFIG_IP_VS_IPV6
 140        if (cp->af == AF_INET6 && iph->fragoffs)
 141                return 1;
 142#endif
 143        oldlen = skb->len - tcphoff;
 144
 145        /* csum_check requires unshared skb */
 146        if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
 147                return 0;
 148
 149        if (unlikely(cp->app != NULL)) {
 150                int ret;
 151
 152                /* Some checks before mangling */
 153                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 154                        return 0;
 155
 156                /* Call application helper if needed */
 157                if (!(ret = ip_vs_app_pkt_out(cp, skb)))
 158                        return 0;
 159                /* ret=2: csum update is needed after payload mangling */
 160                if (ret == 1)
 161                        oldlen = skb->len - tcphoff;
 162                else
 163                        payload_csum = 1;
 164        }
 165
 166        tcph = (void *)skb_network_header(skb) + tcphoff;
 167        tcph->source = cp->vport;
 168
 169        /* Adjust TCP checksums */
 170        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 171                tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 172                                        htons(oldlen),
 173                                        htons(skb->len - tcphoff));
 174        } else if (!payload_csum) {
 175                /* Only port and addr are changed, do fast csum update */
 176                tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 177                                     cp->dport, cp->vport);
 178                if (skb->ip_summed == CHECKSUM_COMPLETE)
 179                        skb->ip_summed = (cp->app && pp->csum_check) ?
 180                                         CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 181        } else {
 182                /* full checksum calculation */
 183                tcph->check = 0;
 184                skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 185#ifdef CONFIG_IP_VS_IPV6
 186                if (cp->af == AF_INET6)
 187                        tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
 188                                                      &cp->caddr.in6,
 189                                                      skb->len - tcphoff,
 190                                                      cp->protocol, skb->csum);
 191                else
 192#endif
 193                        tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
 194                                                        cp->caddr.ip,
 195                                                        skb->len - tcphoff,
 196                                                        cp->protocol,
 197                                                        skb->csum);
 198                skb->ip_summed = CHECKSUM_UNNECESSARY;
 199
 200                IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 201                          pp->name, tcph->check,
 202                          (char*)&(tcph->check) - (char*)tcph);
 203        }
 204        return 1;
 205}
 206
 207
 208static int
 209tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 210                 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
 211{
 212        struct tcphdr *tcph;
 213        unsigned int tcphoff = iph->len;
 214        int oldlen;
 215        int payload_csum = 0;
 216
 217#ifdef CONFIG_IP_VS_IPV6
 218        if (cp->af == AF_INET6 && iph->fragoffs)
 219                return 1;
 220#endif
 221        oldlen = skb->len - tcphoff;
 222
 223        /* csum_check requires unshared skb */
 224        if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
 225                return 0;
 226
 227        if (unlikely(cp->app != NULL)) {
 228                int ret;
 229
 230                /* Some checks before mangling */
 231                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 232                        return 0;
 233
 234                /*
 235                 *      Attempt ip_vs_app call.
 236                 *      It will fix ip_vs_conn and iph ack_seq stuff
 237                 */
 238                if (!(ret = ip_vs_app_pkt_in(cp, skb)))
 239                        return 0;
 240                /* ret=2: csum update is needed after payload mangling */
 241                if (ret == 1)
 242                        oldlen = skb->len - tcphoff;
 243                else
 244                        payload_csum = 1;
 245        }
 246
 247        tcph = (void *)skb_network_header(skb) + tcphoff;
 248        tcph->dest = cp->dport;
 249
 250        /*
 251         *      Adjust TCP checksums
 252         */
 253        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 254                tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 255                                        htons(oldlen),
 256                                        htons(skb->len - tcphoff));
 257        } else if (!payload_csum) {
 258                /* Only port and addr are changed, do fast csum update */
 259                tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 260                                     cp->vport, cp->dport);
 261                if (skb->ip_summed == CHECKSUM_COMPLETE)
 262                        skb->ip_summed = (cp->app && pp->csum_check) ?
 263                                         CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 264        } else {
 265                /* full checksum calculation */
 266                tcph->check = 0;
 267                skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 268#ifdef CONFIG_IP_VS_IPV6
 269                if (cp->af == AF_INET6)
 270                        tcph->check = csum_ipv6_magic(&cp->caddr.in6,
 271                                                      &cp->daddr.in6,
 272                                                      skb->len - tcphoff,
 273                                                      cp->protocol, skb->csum);
 274                else
 275#endif
 276                        tcph->check = csum_tcpudp_magic(cp->caddr.ip,
 277                                                        cp->daddr.ip,
 278                                                        skb->len - tcphoff,
 279                                                        cp->protocol,
 280                                                        skb->csum);
 281                skb->ip_summed = CHECKSUM_UNNECESSARY;
 282        }
 283        return 1;
 284}
 285
 286
 287static int
 288tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 289{
 290        unsigned int tcphoff;
 291
 292#ifdef CONFIG_IP_VS_IPV6
 293        if (af == AF_INET6)
 294                tcphoff = sizeof(struct ipv6hdr);
 295        else
 296#endif
 297                tcphoff = ip_hdrlen(skb);
 298
 299        switch (skb->ip_summed) {
 300        case CHECKSUM_NONE:
 301                skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 302        case CHECKSUM_COMPLETE:
 303#ifdef CONFIG_IP_VS_IPV6
 304                if (af == AF_INET6) {
 305                        if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
 306                                            &ipv6_hdr(skb)->daddr,
 307                                            skb->len - tcphoff,
 308                                            ipv6_hdr(skb)->nexthdr,
 309                                            skb->csum)) {
 310                                IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 311                                                 "Failed checksum for");
 312                                return 0;
 313                        }
 314                } else
 315#endif
 316                        if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
 317                                              ip_hdr(skb)->daddr,
 318                                              skb->len - tcphoff,
 319                                              ip_hdr(skb)->protocol,
 320                                              skb->csum)) {
 321                                IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 322                                                 "Failed checksum for");
 323                                return 0;
 324                        }
 325                break;
 326        default:
 327                /* No need to checksum. */
 328                break;
 329        }
 330
 331        return 1;
 332}
 333
 334
 335#define TCP_DIR_INPUT           0
 336#define TCP_DIR_OUTPUT          4
 337#define TCP_DIR_INPUT_ONLY      8
 338
 339static const int tcp_state_off[IP_VS_DIR_LAST] = {
 340        [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
 341        [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
 342        [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
 343};
 344
 345/*
 346 *      Timeout table[state]
 347 */
 348static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
 349        [IP_VS_TCP_S_NONE]              =       2*HZ,
 350        [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
 351        [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
 352        [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
 353        [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
 354        [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
 355        [IP_VS_TCP_S_CLOSE]             =       10*HZ,
 356        [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
 357        [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
 358        [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
 359        [IP_VS_TCP_S_SYNACK]            =       120*HZ,
 360        [IP_VS_TCP_S_LAST]              =       2*HZ,
 361};
 362
 363static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
 364        [IP_VS_TCP_S_NONE]              =       "NONE",
 365        [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
 366        [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
 367        [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
 368        [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
 369        [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
 370        [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
 371        [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
 372        [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
 373        [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
 374        [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
 375        [IP_VS_TCP_S_LAST]              =       "BUG!",
 376};
 377
 378#define sNO IP_VS_TCP_S_NONE
 379#define sES IP_VS_TCP_S_ESTABLISHED
 380#define sSS IP_VS_TCP_S_SYN_SENT
 381#define sSR IP_VS_TCP_S_SYN_RECV
 382#define sFW IP_VS_TCP_S_FIN_WAIT
 383#define sTW IP_VS_TCP_S_TIME_WAIT
 384#define sCL IP_VS_TCP_S_CLOSE
 385#define sCW IP_VS_TCP_S_CLOSE_WAIT
 386#define sLA IP_VS_TCP_S_LAST_ACK
 387#define sLI IP_VS_TCP_S_LISTEN
 388#define sSA IP_VS_TCP_S_SYNACK
 389
 390struct tcp_states_t {
 391        int next_state[IP_VS_TCP_S_LAST];
 392};
 393
 394static const char * tcp_state_name(int state)
 395{
 396        if (state >= IP_VS_TCP_S_LAST)
 397                return "ERR!";
 398        return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
 399}
 400
 401static struct tcp_states_t tcp_states [] = {
 402/*      INPUT */
 403/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 404/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 405/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
 406/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 407/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
 408
 409/*      OUTPUT */
 410/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 411/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
 412/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
 413/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
 414/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
 415
 416/*      INPUT-ONLY */
 417/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 418/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 419/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
 420/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 421/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 422};
 423
 424static struct tcp_states_t tcp_states_dos [] = {
 425/*      INPUT */
 426/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 427/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
 428/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
 429/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
 430/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 431
 432/*      OUTPUT */
 433/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 434/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
 435/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
 436/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
 437/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
 438
 439/*      INPUT-ONLY */
 440/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 441/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
 442/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
 443/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 444/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 445};
 446
 447static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
 448{
 449        int on = (flags & 1);           /* secure_tcp */
 450
 451        /*
 452        ** FIXME: change secure_tcp to independent sysctl var
 453        ** or make it per-service or per-app because it is valid
 454        ** for most if not for all of the applications. Something
 455        ** like "capabilities" (flags) for each object.
 456        */
 457        pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
 458}
 459
 460static inline int tcp_state_idx(struct tcphdr *th)
 461{
 462        if (th->rst)
 463                return 3;
 464        if (th->syn)
 465                return 0;
 466        if (th->fin)
 467                return 1;
 468        if (th->ack)
 469                return 2;
 470        return -1;
 471}
 472
 473static inline void
 474set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
 475              int direction, struct tcphdr *th)
 476{
 477        int state_idx;
 478        int new_state = IP_VS_TCP_S_CLOSE;
 479        int state_off = tcp_state_off[direction];
 480
 481        /*
 482         *    Update state offset to INPUT_ONLY if necessary
 483         *    or delete NO_OUTPUT flag if output packet detected
 484         */
 485        if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
 486                if (state_off == TCP_DIR_OUTPUT)
 487                        cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
 488                else
 489                        state_off = TCP_DIR_INPUT_ONLY;
 490        }
 491
 492        if ((state_idx = tcp_state_idx(th)) < 0) {
 493                IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
 494                goto tcp_state_out;
 495        }
 496
 497        new_state =
 498                pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
 499
 500  tcp_state_out:
 501        if (new_state != cp->state) {
 502                struct ip_vs_dest *dest = cp->dest;
 503
 504                IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
 505                              "%s:%d state: %s->%s conn->refcnt:%d\n",
 506                              pd->pp->name,
 507                              ((state_off == TCP_DIR_OUTPUT) ?
 508                               "output " : "input "),
 509                              th->syn ? 'S' : '.',
 510                              th->fin ? 'F' : '.',
 511                              th->ack ? 'A' : '.',
 512                              th->rst ? 'R' : '.',
 513                              IP_VS_DBG_ADDR(cp->af, &cp->daddr),
 514                              ntohs(cp->dport),
 515                              IP_VS_DBG_ADDR(cp->af, &cp->caddr),
 516                              ntohs(cp->cport),
 517                              tcp_state_name(cp->state),
 518                              tcp_state_name(new_state),
 519                              atomic_read(&cp->refcnt));
 520
 521                if (dest) {
 522                        if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
 523                            (new_state != IP_VS_TCP_S_ESTABLISHED)) {
 524                                atomic_dec(&dest->activeconns);
 525                                atomic_inc(&dest->inactconns);
 526                                cp->flags |= IP_VS_CONN_F_INACTIVE;
 527                        } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
 528                                   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
 529                                atomic_inc(&dest->activeconns);
 530                                atomic_dec(&dest->inactconns);
 531                                cp->flags &= ~IP_VS_CONN_F_INACTIVE;
 532                        }
 533                }
 534        }
 535
 536        if (likely(pd))
 537                cp->timeout = pd->timeout_table[cp->state = new_state];
 538        else    /* What to do ? */
 539                cp->timeout = tcp_timeouts[cp->state = new_state];
 540}
 541
 542/*
 543 *      Handle state transitions
 544 */
 545static void
 546tcp_state_transition(struct ip_vs_conn *cp, int direction,
 547                     const struct sk_buff *skb,
 548                     struct ip_vs_proto_data *pd)
 549{
 550        struct tcphdr _tcph, *th;
 551
 552#ifdef CONFIG_IP_VS_IPV6
 553        int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
 554#else
 555        int ihl = ip_hdrlen(skb);
 556#endif
 557
 558        th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
 559        if (th == NULL)
 560                return;
 561
 562        spin_lock_bh(&cp->lock);
 563        set_tcp_state(pd, cp, direction, th);
 564        spin_unlock_bh(&cp->lock);
 565}
 566
 567static inline __u16 tcp_app_hashkey(__be16 port)
 568{
 569        return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
 570                & TCP_APP_TAB_MASK;
 571}
 572
 573
 574static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
 575{
 576        struct ip_vs_app *i;
 577        __u16 hash;
 578        __be16 port = inc->port;
 579        int ret = 0;
 580        struct netns_ipvs *ipvs = net_ipvs(net);
 581        struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
 582
 583        hash = tcp_app_hashkey(port);
 584
 585        list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
 586                if (i->port == port) {
 587                        ret = -EEXIST;
 588                        goto out;
 589                }
 590        }
 591        list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
 592        atomic_inc(&pd->appcnt);
 593
 594  out:
 595        return ret;
 596}
 597
 598
 599static void
 600tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
 601{
 602        struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
 603
 604        atomic_dec(&pd->appcnt);
 605        list_del_rcu(&inc->p_list);
 606}
 607
 608
 609static int
 610tcp_app_conn_bind(struct ip_vs_conn *cp)
 611{
 612        struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
 613        int hash;
 614        struct ip_vs_app *inc;
 615        int result = 0;
 616
 617        /* Default binding: bind app only for NAT */
 618        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
 619                return 0;
 620
 621        /* Lookup application incarnations and bind the right one */
 622        hash = tcp_app_hashkey(cp->vport);
 623
 624        rcu_read_lock();
 625        list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
 626                if (inc->port == cp->vport) {
 627                        if (unlikely(!ip_vs_app_inc_get(inc)))
 628                                break;
 629                        rcu_read_unlock();
 630
 631                        IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
 632                                      "%s:%u to app %s on port %u\n",
 633                                      __func__,
 634                                      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
 635                                      ntohs(cp->cport),
 636                                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
 637                                      ntohs(cp->vport),
 638                                      inc->name, ntohs(inc->port));
 639
 640                        cp->app = inc;
 641                        if (inc->init_conn)
 642                                result = inc->init_conn(inc, cp);
 643                        goto out;
 644                }
 645        }
 646        rcu_read_unlock();
 647
 648  out:
 649        return result;
 650}
 651
 652
 653/*
 654 *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 655 */
 656void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
 657{
 658        struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
 659
 660        spin_lock_bh(&cp->lock);
 661        cp->state = IP_VS_TCP_S_LISTEN;
 662        cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
 663                           : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
 664        spin_unlock_bh(&cp->lock);
 665}
 666
 667/* ---------------------------------------------
 668 *   timeouts is netns related now.
 669 * ---------------------------------------------
 670 */
 671static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
 672{
 673        struct netns_ipvs *ipvs = net_ipvs(net);
 674
 675        ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
 676        pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
 677                                                        sizeof(tcp_timeouts));
 678        if (!pd->timeout_table)
 679                return -ENOMEM;
 680        pd->tcp_state_table =  tcp_states;
 681        return 0;
 682}
 683
 684static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
 685{
 686        kfree(pd->timeout_table);
 687}
 688
 689
 690struct ip_vs_protocol ip_vs_protocol_tcp = {
 691        .name =                 "TCP",
 692        .protocol =             IPPROTO_TCP,
 693        .num_states =           IP_VS_TCP_S_LAST,
 694        .dont_defrag =          0,
 695        .init =                 NULL,
 696        .exit =                 NULL,
 697        .init_netns =           __ip_vs_tcp_init,
 698        .exit_netns =           __ip_vs_tcp_exit,
 699        .register_app =         tcp_register_app,
 700        .unregister_app =       tcp_unregister_app,
 701        .conn_schedule =        tcp_conn_schedule,
 702        .conn_in_get =          ip_vs_conn_in_get_proto,
 703        .conn_out_get =         ip_vs_conn_out_get_proto,
 704        .snat_handler =         tcp_snat_handler,
 705        .dnat_handler =         tcp_dnat_handler,
 706        .csum_check =           tcp_csum_check,
 707        .state_name =           tcp_state_name,
 708        .state_transition =     tcp_state_transition,
 709        .app_conn_bind =        tcp_app_conn_bind,
 710        .debug_packet =         ip_vs_tcpudp_debug_packet,
 711        .timeout_change =       tcp_timeout_change,
 712};
 713