linux/net/netfilter/ipvs/ip_vs_proto_tcp.c
<<
>>
Prefs
   1/*
   2 * ip_vs_proto_tcp.c:   TCP load balancing support for IPVS
   3 *
   4 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   5 *              Julian Anastasov <ja@ssi.bg>
   6 *
   7 *              This program is free software; you can redistribute it and/or
   8 *              modify it under the terms of the GNU General Public License
   9 *              as published by the Free Software Foundation; either version
  10 *              2 of the License, or (at your option) any later version.
  11 *
  12 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
  13 *
  14 *              Network name space (netns) aware.
  15 *              Global data moved to netns i.e struct netns_ipvs
  16 *              tcp_timeouts table has copy per netns in a hash table per
  17 *              protocol ip_vs_proto_data and is handled by netns
  18 */
  19
  20#define KMSG_COMPONENT "IPVS"
  21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  22
  23#include <linux/kernel.h>
  24#include <linux/ip.h>
  25#include <linux/tcp.h>                  /* for tcphdr */
  26#include <net/ip.h>
  27#include <net/tcp.h>                    /* for csum_tcpudp_magic */
  28#include <net/ip6_checksum.h>
  29#include <linux/netfilter.h>
  30#include <linux/netfilter_ipv4.h>
  31
  32#include <net/ip_vs.h>
  33
  34static int
  35tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
  36                  struct ip_vs_proto_data *pd,
  37                  int *verdict, struct ip_vs_conn **cpp,
  38                  struct ip_vs_iphdr *iph)
  39{
  40        struct ip_vs_service *svc;
  41        struct tcphdr _tcph, *th;
  42        __be16 _ports[2], *ports = NULL;
  43
  44        /* In the event of icmp, we're only guaranteed to have the first 8
  45         * bytes of the transport header, so we only check the rest of the
  46         * TCP packet for non-ICMP packets
  47         */
  48        if (likely(!ip_vs_iph_icmp(iph))) {
  49                th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
  50                if (th) {
  51                        if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
  52                                return 1;
  53                        ports = &th->source;
  54                }
  55        } else {
  56                ports = skb_header_pointer(
  57                        skb, iph->len, sizeof(_ports), &_ports);
  58        }
  59
  60        if (!ports) {
  61                *verdict = NF_DROP;
  62                return 0;
  63        }
  64
  65        /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
  66        rcu_read_lock();
  67
  68        if (likely(!ip_vs_iph_inverse(iph)))
  69                svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
  70                                         &iph->daddr, ports[1]);
  71        else
  72                svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
  73                                         &iph->saddr, ports[0]);
  74
  75        if (svc) {
  76                int ignored;
  77
  78                if (ip_vs_todrop(ipvs)) {
  79                        /*
  80                         * It seems that we are very loaded.
  81                         * We have to drop this packet :(
  82                         */
  83                        rcu_read_unlock();
  84                        *verdict = NF_DROP;
  85                        return 0;
  86                }
  87
  88                /*
  89                 * Let the virtual server select a real server for the
  90                 * incoming connection, and create a connection entry.
  91                 */
  92                *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
  93                if (!*cpp && ignored <= 0) {
  94                        if (!ignored)
  95                                *verdict = ip_vs_leave(svc, skb, pd, iph);
  96                        else
  97                                *verdict = NF_DROP;
  98                        rcu_read_unlock();
  99                        return 0;
 100                }
 101        }
 102        rcu_read_unlock();
 103        /* NF_ACCEPT */
 104        return 1;
 105}
 106
 107
 108static inline void
 109tcp_fast_csum_update(int af, struct tcphdr *tcph,
 110                     const union nf_inet_addr *oldip,
 111                     const union nf_inet_addr *newip,
 112                     __be16 oldport, __be16 newport)
 113{
 114#ifdef CONFIG_IP_VS_IPV6
 115        if (af == AF_INET6)
 116                tcph->check =
 117                        csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 118                                         ip_vs_check_diff2(oldport, newport,
 119                                                ~csum_unfold(tcph->check))));
 120        else
 121#endif
 122        tcph->check =
 123                csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 124                                 ip_vs_check_diff2(oldport, newport,
 125                                                ~csum_unfold(tcph->check))));
 126}
 127
 128
 129static inline void
 130tcp_partial_csum_update(int af, struct tcphdr *tcph,
 131                     const union nf_inet_addr *oldip,
 132                     const union nf_inet_addr *newip,
 133                     __be16 oldlen, __be16 newlen)
 134{
 135#ifdef CONFIG_IP_VS_IPV6
 136        if (af == AF_INET6)
 137                tcph->check =
 138                        ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 139                                         ip_vs_check_diff2(oldlen, newlen,
 140                                                csum_unfold(tcph->check))));
 141        else
 142#endif
 143        tcph->check =
 144                ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 145                                ip_vs_check_diff2(oldlen, newlen,
 146                                                csum_unfold(tcph->check))));
 147}
 148
 149
 150static int
 151tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 152                 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
 153{
 154        struct tcphdr *tcph;
 155        unsigned int tcphoff = iph->len;
 156        int oldlen;
 157        int payload_csum = 0;
 158
 159#ifdef CONFIG_IP_VS_IPV6
 160        if (cp->af == AF_INET6 && iph->fragoffs)
 161                return 1;
 162#endif
 163        oldlen = skb->len - tcphoff;
 164
 165        /* csum_check requires unshared skb */
 166        if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
 167                return 0;
 168
 169        if (unlikely(cp->app != NULL)) {
 170                int ret;
 171
 172                /* Some checks before mangling */
 173                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 174                        return 0;
 175
 176                /* Call application helper if needed */
 177                if (!(ret = ip_vs_app_pkt_out(cp, skb)))
 178                        return 0;
 179                /* ret=2: csum update is needed after payload mangling */
 180                if (ret == 1)
 181                        oldlen = skb->len - tcphoff;
 182                else
 183                        payload_csum = 1;
 184        }
 185
 186        tcph = (void *)skb_network_header(skb) + tcphoff;
 187        tcph->source = cp->vport;
 188
 189        /* Adjust TCP checksums */
 190        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 191                tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 192                                        htons(oldlen),
 193                                        htons(skb->len - tcphoff));
 194        } else if (!payload_csum) {
 195                /* Only port and addr are changed, do fast csum update */
 196                tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 197                                     cp->dport, cp->vport);
 198                if (skb->ip_summed == CHECKSUM_COMPLETE)
 199                        skb->ip_summed = (cp->app && pp->csum_check) ?
 200                                         CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 201        } else {
 202                /* full checksum calculation */
 203                tcph->check = 0;
 204                skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 205#ifdef CONFIG_IP_VS_IPV6
 206                if (cp->af == AF_INET6)
 207                        tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
 208                                                      &cp->caddr.in6,
 209                                                      skb->len - tcphoff,
 210                                                      cp->protocol, skb->csum);
 211                else
 212#endif
 213                        tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
 214                                                        cp->caddr.ip,
 215                                                        skb->len - tcphoff,
 216                                                        cp->protocol,
 217                                                        skb->csum);
 218                skb->ip_summed = CHECKSUM_UNNECESSARY;
 219
 220                IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 221                          pp->name, tcph->check,
 222                          (char*)&(tcph->check) - (char*)tcph);
 223        }
 224        return 1;
 225}
 226
 227
 228static int
 229tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 230                 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
 231{
 232        struct tcphdr *tcph;
 233        unsigned int tcphoff = iph->len;
 234        int oldlen;
 235        int payload_csum = 0;
 236
 237#ifdef CONFIG_IP_VS_IPV6
 238        if (cp->af == AF_INET6 && iph->fragoffs)
 239                return 1;
 240#endif
 241        oldlen = skb->len - tcphoff;
 242
 243        /* csum_check requires unshared skb */
 244        if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
 245                return 0;
 246
 247        if (unlikely(cp->app != NULL)) {
 248                int ret;
 249
 250                /* Some checks before mangling */
 251                if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 252                        return 0;
 253
 254                /*
 255                 *      Attempt ip_vs_app call.
 256                 *      It will fix ip_vs_conn and iph ack_seq stuff
 257                 */
 258                if (!(ret = ip_vs_app_pkt_in(cp, skb)))
 259                        return 0;
 260                /* ret=2: csum update is needed after payload mangling */
 261                if (ret == 1)
 262                        oldlen = skb->len - tcphoff;
 263                else
 264                        payload_csum = 1;
 265        }
 266
 267        tcph = (void *)skb_network_header(skb) + tcphoff;
 268        tcph->dest = cp->dport;
 269
 270        /*
 271         *      Adjust TCP checksums
 272         */
 273        if (skb->ip_summed == CHECKSUM_PARTIAL) {
 274                tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 275                                        htons(oldlen),
 276                                        htons(skb->len - tcphoff));
 277        } else if (!payload_csum) {
 278                /* Only port and addr are changed, do fast csum update */
 279                tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 280                                     cp->vport, cp->dport);
 281                if (skb->ip_summed == CHECKSUM_COMPLETE)
 282                        skb->ip_summed = (cp->app && pp->csum_check) ?
 283                                         CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 284        } else {
 285                /* full checksum calculation */
 286                tcph->check = 0;
 287                skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 288#ifdef CONFIG_IP_VS_IPV6
 289                if (cp->af == AF_INET6)
 290                        tcph->check = csum_ipv6_magic(&cp->caddr.in6,
 291                                                      &cp->daddr.in6,
 292                                                      skb->len - tcphoff,
 293                                                      cp->protocol, skb->csum);
 294                else
 295#endif
 296                        tcph->check = csum_tcpudp_magic(cp->caddr.ip,
 297                                                        cp->daddr.ip,
 298                                                        skb->len - tcphoff,
 299                                                        cp->protocol,
 300                                                        skb->csum);
 301                skb->ip_summed = CHECKSUM_UNNECESSARY;
 302        }
 303        return 1;
 304}
 305
 306
 307static int
 308tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 309{
 310        unsigned int tcphoff;
 311
 312#ifdef CONFIG_IP_VS_IPV6
 313        if (af == AF_INET6)
 314                tcphoff = sizeof(struct ipv6hdr);
 315        else
 316#endif
 317                tcphoff = ip_hdrlen(skb);
 318
 319        switch (skb->ip_summed) {
 320        case CHECKSUM_NONE:
 321                skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
 322        case CHECKSUM_COMPLETE:
 323#ifdef CONFIG_IP_VS_IPV6
 324                if (af == AF_INET6) {
 325                        if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
 326                                            &ipv6_hdr(skb)->daddr,
 327                                            skb->len - tcphoff,
 328                                            ipv6_hdr(skb)->nexthdr,
 329                                            skb->csum)) {
 330                                IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 331                                                 "Failed checksum for");
 332                                return 0;
 333                        }
 334                } else
 335#endif
 336                        if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
 337                                              ip_hdr(skb)->daddr,
 338                                              skb->len - tcphoff,
 339                                              ip_hdr(skb)->protocol,
 340                                              skb->csum)) {
 341                                IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 342                                                 "Failed checksum for");
 343                                return 0;
 344                        }
 345                break;
 346        default:
 347                /* No need to checksum. */
 348                break;
 349        }
 350
 351        return 1;
 352}
 353
 354
 355#define TCP_DIR_INPUT           0
 356#define TCP_DIR_OUTPUT          4
 357#define TCP_DIR_INPUT_ONLY      8
 358
 359static const int tcp_state_off[IP_VS_DIR_LAST] = {
 360        [IP_VS_DIR_INPUT]               =       TCP_DIR_INPUT,
 361        [IP_VS_DIR_OUTPUT]              =       TCP_DIR_OUTPUT,
 362        [IP_VS_DIR_INPUT_ONLY]          =       TCP_DIR_INPUT_ONLY,
 363};
 364
 365/*
 366 *      Timeout table[state]
 367 */
 368static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
 369        [IP_VS_TCP_S_NONE]              =       2*HZ,
 370        [IP_VS_TCP_S_ESTABLISHED]       =       15*60*HZ,
 371        [IP_VS_TCP_S_SYN_SENT]          =       2*60*HZ,
 372        [IP_VS_TCP_S_SYN_RECV]          =       1*60*HZ,
 373        [IP_VS_TCP_S_FIN_WAIT]          =       2*60*HZ,
 374        [IP_VS_TCP_S_TIME_WAIT]         =       2*60*HZ,
 375        [IP_VS_TCP_S_CLOSE]             =       10*HZ,
 376        [IP_VS_TCP_S_CLOSE_WAIT]        =       60*HZ,
 377        [IP_VS_TCP_S_LAST_ACK]          =       30*HZ,
 378        [IP_VS_TCP_S_LISTEN]            =       2*60*HZ,
 379        [IP_VS_TCP_S_SYNACK]            =       120*HZ,
 380        [IP_VS_TCP_S_LAST]              =       2*HZ,
 381};
 382
 383static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
 384        [IP_VS_TCP_S_NONE]              =       "NONE",
 385        [IP_VS_TCP_S_ESTABLISHED]       =       "ESTABLISHED",
 386        [IP_VS_TCP_S_SYN_SENT]          =       "SYN_SENT",
 387        [IP_VS_TCP_S_SYN_RECV]          =       "SYN_RECV",
 388        [IP_VS_TCP_S_FIN_WAIT]          =       "FIN_WAIT",
 389        [IP_VS_TCP_S_TIME_WAIT]         =       "TIME_WAIT",
 390        [IP_VS_TCP_S_CLOSE]             =       "CLOSE",
 391        [IP_VS_TCP_S_CLOSE_WAIT]        =       "CLOSE_WAIT",
 392        [IP_VS_TCP_S_LAST_ACK]          =       "LAST_ACK",
 393        [IP_VS_TCP_S_LISTEN]            =       "LISTEN",
 394        [IP_VS_TCP_S_SYNACK]            =       "SYNACK",
 395        [IP_VS_TCP_S_LAST]              =       "BUG!",
 396};
 397
 398#define sNO IP_VS_TCP_S_NONE
 399#define sES IP_VS_TCP_S_ESTABLISHED
 400#define sSS IP_VS_TCP_S_SYN_SENT
 401#define sSR IP_VS_TCP_S_SYN_RECV
 402#define sFW IP_VS_TCP_S_FIN_WAIT
 403#define sTW IP_VS_TCP_S_TIME_WAIT
 404#define sCL IP_VS_TCP_S_CLOSE
 405#define sCW IP_VS_TCP_S_CLOSE_WAIT
 406#define sLA IP_VS_TCP_S_LAST_ACK
 407#define sLI IP_VS_TCP_S_LISTEN
 408#define sSA IP_VS_TCP_S_SYNACK
 409
 410struct tcp_states_t {
 411        int next_state[IP_VS_TCP_S_LAST];
 412};
 413
 414static const char * tcp_state_name(int state)
 415{
 416        if (state >= IP_VS_TCP_S_LAST)
 417                return "ERR!";
 418        return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
 419}
 420
 421static struct tcp_states_t tcp_states [] = {
 422/*      INPUT */
 423/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 424/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 425/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
 426/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 427/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
 428
 429/*      OUTPUT */
 430/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 431/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
 432/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
 433/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
 434/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
 435
 436/*      INPUT-ONLY */
 437/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 438/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
 439/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
 440/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 441/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 442};
 443
 444static struct tcp_states_t tcp_states_dos [] = {
 445/*      INPUT */
 446/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 447/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
 448/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
 449/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
 450/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 451
 452/*      OUTPUT */
 453/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 454/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
 455/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
 456/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
 457/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
 458
 459/*      INPUT-ONLY */
 460/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
 461/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
 462/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
 463/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
 464/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 465};
 466
 467static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
 468{
 469        int on = (flags & 1);           /* secure_tcp */
 470
 471        /*
 472        ** FIXME: change secure_tcp to independent sysctl var
 473        ** or make it per-service or per-app because it is valid
 474        ** for most if not for all of the applications. Something
 475        ** like "capabilities" (flags) for each object.
 476        */
 477        pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
 478}
 479
 480static inline int tcp_state_idx(struct tcphdr *th)
 481{
 482        if (th->rst)
 483                return 3;
 484        if (th->syn)
 485                return 0;
 486        if (th->fin)
 487                return 1;
 488        if (th->ack)
 489                return 2;
 490        return -1;
 491}
 492
 493static inline void
 494set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
 495              int direction, struct tcphdr *th)
 496{
 497        int state_idx;
 498        int new_state = IP_VS_TCP_S_CLOSE;
 499        int state_off = tcp_state_off[direction];
 500
 501        /*
 502         *    Update state offset to INPUT_ONLY if necessary
 503         *    or delete NO_OUTPUT flag if output packet detected
 504         */
 505        if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
 506                if (state_off == TCP_DIR_OUTPUT)
 507                        cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
 508                else
 509                        state_off = TCP_DIR_INPUT_ONLY;
 510        }
 511
 512        if ((state_idx = tcp_state_idx(th)) < 0) {
 513                IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
 514                goto tcp_state_out;
 515        }
 516
 517        new_state =
 518                pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
 519
 520  tcp_state_out:
 521        if (new_state != cp->state) {
 522                struct ip_vs_dest *dest = cp->dest;
 523
 524                IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
 525                              "%s:%d state: %s->%s conn->refcnt:%d\n",
 526                              pd->pp->name,
 527                              ((state_off == TCP_DIR_OUTPUT) ?
 528                               "output " : "input "),
 529                              th->syn ? 'S' : '.',
 530                              th->fin ? 'F' : '.',
 531                              th->ack ? 'A' : '.',
 532                              th->rst ? 'R' : '.',
 533                              IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
 534                              ntohs(cp->dport),
 535                              IP_VS_DBG_ADDR(cp->af, &cp->caddr),
 536                              ntohs(cp->cport),
 537                              tcp_state_name(cp->state),
 538                              tcp_state_name(new_state),
 539                              atomic_read(&cp->refcnt));
 540
 541                if (dest) {
 542                        if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
 543                            (new_state != IP_VS_TCP_S_ESTABLISHED)) {
 544                                atomic_dec(&dest->activeconns);
 545                                atomic_inc(&dest->inactconns);
 546                                cp->flags |= IP_VS_CONN_F_INACTIVE;
 547                        } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
 548                                   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
 549                                atomic_inc(&dest->activeconns);
 550                                atomic_dec(&dest->inactconns);
 551                                cp->flags &= ~IP_VS_CONN_F_INACTIVE;
 552                        }
 553                }
 554        }
 555
 556        if (likely(pd))
 557                cp->timeout = pd->timeout_table[cp->state = new_state];
 558        else    /* What to do ? */
 559                cp->timeout = tcp_timeouts[cp->state = new_state];
 560}
 561
 562/*
 563 *      Handle state transitions
 564 */
 565static void
 566tcp_state_transition(struct ip_vs_conn *cp, int direction,
 567                     const struct sk_buff *skb,
 568                     struct ip_vs_proto_data *pd)
 569{
 570        struct tcphdr _tcph, *th;
 571
 572#ifdef CONFIG_IP_VS_IPV6
 573        int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
 574#else
 575        int ihl = ip_hdrlen(skb);
 576#endif
 577
 578        th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
 579        if (th == NULL)
 580                return;
 581
 582        spin_lock_bh(&cp->lock);
 583        set_tcp_state(pd, cp, direction, th);
 584        spin_unlock_bh(&cp->lock);
 585}
 586
 587static inline __u16 tcp_app_hashkey(__be16 port)
 588{
 589        return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
 590                & TCP_APP_TAB_MASK;
 591}
 592
 593
 594static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
 595{
 596        struct ip_vs_app *i;
 597        __u16 hash;
 598        __be16 port = inc->port;
 599        int ret = 0;
 600        struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
 601
 602        hash = tcp_app_hashkey(port);
 603
 604        list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
 605                if (i->port == port) {
 606                        ret = -EEXIST;
 607                        goto out;
 608                }
 609        }
 610        list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
 611        atomic_inc(&pd->appcnt);
 612
 613  out:
 614        return ret;
 615}
 616
 617
 618static void
 619tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
 620{
 621        struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
 622
 623        atomic_dec(&pd->appcnt);
 624        list_del_rcu(&inc->p_list);
 625}
 626
 627
 628static int
 629tcp_app_conn_bind(struct ip_vs_conn *cp)
 630{
 631        struct netns_ipvs *ipvs = cp->ipvs;
 632        int hash;
 633        struct ip_vs_app *inc;
 634        int result = 0;
 635
 636        /* Default binding: bind app only for NAT */
 637        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
 638                return 0;
 639
 640        /* Lookup application incarnations and bind the right one */
 641        hash = tcp_app_hashkey(cp->vport);
 642
 643        rcu_read_lock();
 644        list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
 645                if (inc->port == cp->vport) {
 646                        if (unlikely(!ip_vs_app_inc_get(inc)))
 647                                break;
 648                        rcu_read_unlock();
 649
 650                        IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
 651                                      "%s:%u to app %s on port %u\n",
 652                                      __func__,
 653                                      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
 654                                      ntohs(cp->cport),
 655                                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
 656                                      ntohs(cp->vport),
 657                                      inc->name, ntohs(inc->port));
 658
 659                        cp->app = inc;
 660                        if (inc->init_conn)
 661                                result = inc->init_conn(inc, cp);
 662                        goto out;
 663                }
 664        }
 665        rcu_read_unlock();
 666
 667  out:
 668        return result;
 669}
 670
 671
 672/*
 673 *      Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 674 */
 675void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
 676{
 677        struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);
 678
 679        spin_lock_bh(&cp->lock);
 680        cp->state = IP_VS_TCP_S_LISTEN;
 681        cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
 682                           : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
 683        spin_unlock_bh(&cp->lock);
 684}
 685
 686/* ---------------------------------------------
 687 *   timeouts is netns related now.
 688 * ---------------------------------------------
 689 */
 690static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
 691{
 692        ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
 693        pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
 694                                                        sizeof(tcp_timeouts));
 695        if (!pd->timeout_table)
 696                return -ENOMEM;
 697        pd->tcp_state_table =  tcp_states;
 698        return 0;
 699}
 700
 701static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
 702{
 703        kfree(pd->timeout_table);
 704}
 705
 706
 707struct ip_vs_protocol ip_vs_protocol_tcp = {
 708        .name =                 "TCP",
 709        .protocol =             IPPROTO_TCP,
 710        .num_states =           IP_VS_TCP_S_LAST,
 711        .dont_defrag =          0,
 712        .init =                 NULL,
 713        .exit =                 NULL,
 714        .init_netns =           __ip_vs_tcp_init,
 715        .exit_netns =           __ip_vs_tcp_exit,
 716        .register_app =         tcp_register_app,
 717        .unregister_app =       tcp_unregister_app,
 718        .conn_schedule =        tcp_conn_schedule,
 719        .conn_in_get =          ip_vs_conn_in_get_proto,
 720        .conn_out_get =         ip_vs_conn_out_get_proto,
 721        .snat_handler =         tcp_snat_handler,
 722        .dnat_handler =         tcp_dnat_handler,
 723        .csum_check =           tcp_csum_check,
 724        .state_name =           tcp_state_name,
 725        .state_transition =     tcp_state_transition,
 726        .app_conn_bind =        tcp_app_conn_bind,
 727        .debug_packet =         ip_vs_tcpudp_debug_packet,
 728        .timeout_change =       tcp_timeout_change,
 729};
 730