linux/net/netfilter/nf_conntrack_proto_tcp.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/* (C) 1999-2001 Paul `Rusty' Russell
   3 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
   4 * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
   5 * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
   6 */
   7
   8#include <linux/types.h>
   9#include <linux/timer.h>
  10#include <linux/module.h>
  11#include <linux/in.h>
  12#include <linux/tcp.h>
  13#include <linux/spinlock.h>
  14#include <linux/skbuff.h>
  15#include <linux/ipv6.h>
  16#include <net/ip6_checksum.h>
  17#include <asm/unaligned.h>
  18
  19#include <net/tcp.h>
  20
  21#include <linux/netfilter.h>
  22#include <linux/netfilter_ipv4.h>
  23#include <linux/netfilter_ipv6.h>
  24#include <net/netfilter/nf_conntrack.h>
  25#include <net/netfilter/nf_conntrack_l4proto.h>
  26#include <net/netfilter/nf_conntrack_ecache.h>
  27#include <net/netfilter/nf_conntrack_seqadj.h>
  28#include <net/netfilter/nf_conntrack_synproxy.h>
  29#include <net/netfilter/nf_conntrack_timeout.h>
  30#include <net/netfilter/nf_log.h>
  31#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
  32#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
  33
  34/* "Be conservative in what you do,
  35    be liberal in what you accept from others."
  36    If it's non-zero, we mark only out of window RST segments as INVALID. */
  37static int nf_ct_tcp_be_liberal __read_mostly = 0;
  38
  39/* If it is set to zero, we disable picking up already established
  40   connections. */
  41static int nf_ct_tcp_loose __read_mostly = 1;
  42
  43/* Max number of the retransmitted packets without receiving an (acceptable)
  44   ACK from the destination. If this number is reached, a shorter timer
  45   will be started. */
  46static int nf_ct_tcp_max_retrans __read_mostly = 3;
  47
  48  /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
  49     closely.  They're more complex. --RR */
  50
  51static const char *const tcp_conntrack_names[] = {
  52        "NONE",
  53        "SYN_SENT",
  54        "SYN_RECV",
  55        "ESTABLISHED",
  56        "FIN_WAIT",
  57        "CLOSE_WAIT",
  58        "LAST_ACK",
  59        "TIME_WAIT",
  60        "CLOSE",
  61        "SYN_SENT2",
  62};
  63
  64#define SECS * HZ
  65#define MINS * 60 SECS
  66#define HOURS * 60 MINS
  67#define DAYS * 24 HOURS
  68
  69static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
  70        [TCP_CONNTRACK_SYN_SENT]        = 2 MINS,
  71        [TCP_CONNTRACK_SYN_RECV]        = 60 SECS,
  72        [TCP_CONNTRACK_ESTABLISHED]     = 5 DAYS,
  73        [TCP_CONNTRACK_FIN_WAIT]        = 2 MINS,
  74        [TCP_CONNTRACK_CLOSE_WAIT]      = 60 SECS,
  75        [TCP_CONNTRACK_LAST_ACK]        = 30 SECS,
  76        [TCP_CONNTRACK_TIME_WAIT]       = 2 MINS,
  77        [TCP_CONNTRACK_CLOSE]           = 10 SECS,
  78        [TCP_CONNTRACK_SYN_SENT2]       = 2 MINS,
  79/* RFC1122 says the R2 limit should be at least 100 seconds.
  80   Linux uses 15 packets as limit, which corresponds
  81   to ~13-30min depending on RTO. */
  82        [TCP_CONNTRACK_RETRANS]         = 5 MINS,
  83        [TCP_CONNTRACK_UNACK]           = 5 MINS,
  84};
  85
  86#define sNO TCP_CONNTRACK_NONE
  87#define sSS TCP_CONNTRACK_SYN_SENT
  88#define sSR TCP_CONNTRACK_SYN_RECV
  89#define sES TCP_CONNTRACK_ESTABLISHED
  90#define sFW TCP_CONNTRACK_FIN_WAIT
  91#define sCW TCP_CONNTRACK_CLOSE_WAIT
  92#define sLA TCP_CONNTRACK_LAST_ACK
  93#define sTW TCP_CONNTRACK_TIME_WAIT
  94#define sCL TCP_CONNTRACK_CLOSE
  95#define sS2 TCP_CONNTRACK_SYN_SENT2
  96#define sIV TCP_CONNTRACK_MAX
  97#define sIG TCP_CONNTRACK_IGNORE
  98
  99/* What TCP flags are set from RST/SYN/FIN/ACK. */
 100enum tcp_bit_set {
 101        TCP_SYN_SET,
 102        TCP_SYNACK_SET,
 103        TCP_FIN_SET,
 104        TCP_ACK_SET,
 105        TCP_RST_SET,
 106        TCP_NONE_SET,
 107};
 108
 109/*
 110 * The TCP state transition table needs a few words...
 111 *
 112 * We are the man in the middle. All the packets go through us
 113 * but might get lost in transit to the destination.
 114 * It is assumed that the destinations can't receive segments
 115 * we haven't seen.
 116 *
 117 * The checked segment is in window, but our windows are *not*
 118 * equivalent with the ones of the sender/receiver. We always
 119 * try to guess the state of the current sender.
 120 *
 * The meanings of the states are:
 122 *
 123 * NONE:        initial state
 124 * SYN_SENT:    SYN-only packet seen
 125 * SYN_SENT2:   SYN-only packet seen from reply dir, simultaneous open
 126 * SYN_RECV:    SYN-ACK packet seen
 127 * ESTABLISHED: ACK packet seen
 128 * FIN_WAIT:    FIN packet seen
 129 * CLOSE_WAIT:  ACK seen (after FIN)
 130 * LAST_ACK:    FIN seen (after FIN)
 131 * TIME_WAIT:   last ACK seen
 132 * CLOSE:       closed connection (RST)
 133 *
 134 * Packets marked as IGNORED (sIG):
 135 *      if they may be either invalid or valid
 136 *      and the receiver may send back a connection
 137 *      closing RST or a SYN/ACK.
 138 *
 139 * Packets marked as INVALID (sIV):
 140 *      if we regard them as truly invalid packets
 141 */
 142static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
 143        {
 144/* ORIGINAL */
 145/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 146/*syn*/    { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
 147/*
 148 *      sNO -> sSS      Initialize a new connection
 149 *      sSS -> sSS      Retransmitted SYN
 150 *      sS2 -> sS2      Late retransmitted SYN
 151 *      sSR -> sIG
 152 *      sES -> sIG      Error: SYNs in window outside the SYN_SENT state
 153 *                      are errors. Receiver will reply with RST
 154 *                      and close the connection.
 155 *                      Or we are not in sync and hold a dead connection.
 156 *      sFW -> sIG
 157 *      sCW -> sIG
 158 *      sLA -> sIG
 159 *      sTW -> sSS      Reopened connection (RFC 1122).
 160 *      sCL -> sSS
 161 */
 162/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 163/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
 164/*
 165 *      sNO -> sIV      Too late and no reason to do anything
 166 *      sSS -> sIV      Client can't send SYN and then SYN/ACK
 167 *      sS2 -> sSR      SYN/ACK sent to SYN2 in simultaneous open
 168 *      sSR -> sSR      Late retransmitted SYN/ACK in simultaneous open
 169 *      sES -> sIV      Invalid SYN/ACK packets sent by the client
 170 *      sFW -> sIV
 171 *      sCW -> sIV
 172 *      sLA -> sIV
 173 *      sTW -> sIV
 174 *      sCL -> sIV
 175 */
 176/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 177/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
 178/*
 179 *      sNO -> sIV      Too late and no reason to do anything...
 180 *      sSS -> sIV      Client migth not send FIN in this state:
 181 *                      we enforce waiting for a SYN/ACK reply first.
 182 *      sS2 -> sIV
 183 *      sSR -> sFW      Close started.
 184 *      sES -> sFW
 185 *      sFW -> sLA      FIN seen in both directions, waiting for
 186 *                      the last ACK.
 187 *                      Migth be a retransmitted FIN as well...
 188 *      sCW -> sLA
 189 *      sLA -> sLA      Retransmitted FIN. Remain in the same state.
 190 *      sTW -> sTW
 191 *      sCL -> sCL
 192 */
 193/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 194/*ack*/    { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
 195/*
 196 *      sNO -> sES      Assumed.
 197 *      sSS -> sIV      ACK is invalid: we haven't seen a SYN/ACK yet.
 198 *      sS2 -> sIV
 199 *      sSR -> sES      Established state is reached.
 200 *      sES -> sES      :-)
 201 *      sFW -> sCW      Normal close request answered by ACK.
 202 *      sCW -> sCW
 203 *      sLA -> sTW      Last ACK detected (RFC5961 challenged)
 204 *      sTW -> sTW      Retransmitted last ACK. Remain in the same state.
 205 *      sCL -> sCL
 206 */
 207/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 208/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
 209/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
 210        },
 211        {
 212/* REPLY */
 213/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 214/*syn*/    { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
 215/*
 216 *      sNO -> sIV      Never reached.
 217 *      sSS -> sS2      Simultaneous open
 218 *      sS2 -> sS2      Retransmitted simultaneous SYN
 219 *      sSR -> sIV      Invalid SYN packets sent by the server
 220 *      sES -> sIV
 221 *      sFW -> sIV
 222 *      sCW -> sIV
 223 *      sLA -> sIV
 224 *      sTW -> sSS      Reopened connection, but server may have switched role
 225 *      sCL -> sIV
 226 */
 227/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 228/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
 229/*
 230 *      sSS -> sSR      Standard open.
 231 *      sS2 -> sSR      Simultaneous open
 232 *      sSR -> sIG      Retransmitted SYN/ACK, ignore it.
 233 *      sES -> sIG      Late retransmitted SYN/ACK?
 234 *      sFW -> sIG      Might be SYN/ACK answering ignored SYN
 235 *      sCW -> sIG
 236 *      sLA -> sIG
 237 *      sTW -> sIG
 238 *      sCL -> sIG
 239 */
 240/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 241/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
 242/*
 243 *      sSS -> sIV      Server might not send FIN in this state.
 244 *      sS2 -> sIV
 245 *      sSR -> sFW      Close started.
 246 *      sES -> sFW
 247 *      sFW -> sLA      FIN seen in both directions.
 248 *      sCW -> sLA
 249 *      sLA -> sLA      Retransmitted FIN.
 250 *      sTW -> sTW
 251 *      sCL -> sCL
 252 */
 253/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 254/*ack*/    { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
 255/*
 256 *      sSS -> sIG      Might be a half-open connection.
 257 *      sS2 -> sIG
 258 *      sSR -> sSR      Might answer late resent SYN.
 259 *      sES -> sES      :-)
 260 *      sFW -> sCW      Normal close request answered by ACK.
 261 *      sCW -> sCW
 262 *      sLA -> sTW      Last ACK detected (RFC5961 challenged)
 263 *      sTW -> sTW      Retransmitted last ACK.
 264 *      sCL -> sCL
 265 */
 266/*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
 267/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
 268/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
 269        }
 270};
 271
 272#ifdef CONFIG_NF_CONNTRACK_PROCFS
 273/* Print out the private part of the conntrack. */
 274static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 275{
 276        if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
 277                return;
 278
 279        seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
 280}
 281#endif
 282
 283static unsigned int get_conntrack_index(const struct tcphdr *tcph)
 284{
 285        if (tcph->rst) return TCP_RST_SET;
 286        else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
 287        else if (tcph->fin) return TCP_FIN_SET;
 288        else if (tcph->ack) return TCP_ACK_SET;
 289        else return TCP_NONE_SET;
 290}
 291
 292/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
 293   in IP Filter' by Guido van Rooij.
 294
 295   http://www.sane.nl/events/sane2000/papers.html
 296   http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
 297
 298   The boundaries and the conditions are changed according to RFC793:
 299   the packet must intersect the window (i.e. segments may be
 300   after the right or before the left edge) and thus receivers may ACK
 301   segments after the right edge of the window.
 302
 303        td_maxend = max(sack + max(win,1)) seen in reply packets
 304        td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
 305        td_maxwin += seq + len - sender.td_maxend
 306                        if seq + len > sender.td_maxend
 307        td_end    = max(seq + len) seen in sent packets
 308
 309   I.   Upper bound for valid data:     seq <= sender.td_maxend
 310   II.  Lower bound for valid data:     seq + len >= sender.td_end - receiver.td_maxwin
 311   III. Upper bound for valid (s)ack:   sack <= receiver.td_end
 312   IV.  Lower bound for valid (s)ack:   sack >= receiver.td_end - MAXACKWINDOW
 313
 314   where sack is the highest right edge of sack block found in the packet
 315   or ack in the case of packet without SACK option.
 316
   The upper bound limit for a valid (s)ack is not ignored -
   we don't have to deal with fragments.
 319*/
 320
 321static inline __u32 segment_seq_plus_len(__u32 seq,
 322                                         size_t len,
 323                                         unsigned int dataoff,
 324                                         const struct tcphdr *tcph)
 325{
 326        /* XXX Should I use payload length field in IP/IPv6 header ?
 327         * - YK */
 328        return (seq + len - dataoff - tcph->doff*4
 329                + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
 330}
 331
 332/* Fixme: what about big packets? */
 333#define MAXACKWINCONST                  66000
 334#define MAXACKWINDOW(sender)                                            \
 335        ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin     \
 336                                              : MAXACKWINCONST)
 337
 338/*
 339 * Simplified tcp_parse_options routine from tcp_input.c
 340 */
 341static void tcp_options(const struct sk_buff *skb,
 342                        unsigned int dataoff,
 343                        const struct tcphdr *tcph,
 344                        struct ip_ct_tcp_state *state)
 345{
 346        unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
 347        const unsigned char *ptr;
 348        int length = (tcph->doff*4) - sizeof(struct tcphdr);
 349
 350        if (!length)
 351                return;
 352
 353        ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
 354                                 length, buff);
 355        BUG_ON(ptr == NULL);
 356
 357        state->td_scale =
 358        state->flags = 0;
 359
 360        while (length > 0) {
 361                int opcode=*ptr++;
 362                int opsize;
 363
 364                switch (opcode) {
 365                case TCPOPT_EOL:
 366                        return;
 367                case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 368                        length--;
 369                        continue;
 370                default:
 371                        if (length < 2)
 372                                return;
 373                        opsize=*ptr++;
 374                        if (opsize < 2) /* "silly options" */
 375                                return;
 376                        if (opsize > length)
 377                                return; /* don't parse partial options */
 378
 379                        if (opcode == TCPOPT_SACK_PERM
 380                            && opsize == TCPOLEN_SACK_PERM)
 381                                state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
 382                        else if (opcode == TCPOPT_WINDOW
 383                                 && opsize == TCPOLEN_WINDOW) {
 384                                state->td_scale = *(u_int8_t *)ptr;
 385
 386                                if (state->td_scale > TCP_MAX_WSCALE)
 387                                        state->td_scale = TCP_MAX_WSCALE;
 388
 389                                state->flags |=
 390                                        IP_CT_TCP_FLAG_WINDOW_SCALE;
 391                        }
 392                        ptr += opsize - 2;
 393                        length -= opsize;
 394                }
 395        }
 396}
 397
 398static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
 399                     const struct tcphdr *tcph, __u32 *sack)
 400{
 401        unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
 402        const unsigned char *ptr;
 403        int length = (tcph->doff*4) - sizeof(struct tcphdr);
 404        __u32 tmp;
 405
 406        if (!length)
 407                return;
 408
 409        ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
 410                                 length, buff);
 411        BUG_ON(ptr == NULL);
 412
 413        /* Fast path for timestamp-only option */
 414        if (length == TCPOLEN_TSTAMP_ALIGNED
 415            && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
 416                                       | (TCPOPT_NOP << 16)
 417                                       | (TCPOPT_TIMESTAMP << 8)
 418                                       | TCPOLEN_TIMESTAMP))
 419                return;
 420
 421        while (length > 0) {
 422                int opcode = *ptr++;
 423                int opsize, i;
 424
 425                switch (opcode) {
 426                case TCPOPT_EOL:
 427                        return;
 428                case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
 429                        length--;
 430                        continue;
 431                default:
 432                        if (length < 2)
 433                                return;
 434                        opsize = *ptr++;
 435                        if (opsize < 2) /* "silly options" */
 436                                return;
 437                        if (opsize > length)
 438                                return; /* don't parse partial options */
 439
 440                        if (opcode == TCPOPT_SACK
 441                            && opsize >= (TCPOLEN_SACK_BASE
 442                                          + TCPOLEN_SACK_PERBLOCK)
 443                            && !((opsize - TCPOLEN_SACK_BASE)
 444                                 % TCPOLEN_SACK_PERBLOCK)) {
 445                                for (i = 0;
 446                                     i < (opsize - TCPOLEN_SACK_BASE);
 447                                     i += TCPOLEN_SACK_PERBLOCK) {
 448                                        tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
 449
 450                                        if (after(tmp, *sack))
 451                                                *sack = tmp;
 452                                }
 453                                return;
 454                        }
 455                        ptr += opsize - 2;
 456                        length -= opsize;
 457                }
 458        }
 459}
 460
 461static bool tcp_in_window(const struct nf_conn *ct,
 462                          struct ip_ct_tcp *state,
 463                          enum ip_conntrack_dir dir,
 464                          unsigned int index,
 465                          const struct sk_buff *skb,
 466                          unsigned int dataoff,
 467                          const struct tcphdr *tcph)
 468{
 469        struct net *net = nf_ct_net(ct);
 470        struct nf_tcp_net *tn = nf_tcp_pernet(net);
 471        struct ip_ct_tcp_state *sender = &state->seen[dir];
 472        struct ip_ct_tcp_state *receiver = &state->seen[!dir];
 473        const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
 474        __u32 seq, ack, sack, end, win, swin;
 475        u16 win_raw;
 476        s32 receiver_offset;
 477        bool res, in_recv_win;
 478
 479        /*
 480         * Get the required data from the packet.
 481         */
 482        seq = ntohl(tcph->seq);
 483        ack = sack = ntohl(tcph->ack_seq);
 484        win_raw = ntohs(tcph->window);
 485        win = win_raw;
 486        end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
 487
 488        if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
 489                tcp_sack(skb, dataoff, tcph, &sack);
 490
 491        /* Take into account NAT sequence number mangling */
 492        receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
 493        ack -= receiver_offset;
 494        sack -= receiver_offset;
 495
 496        pr_debug("tcp_in_window: START\n");
 497        pr_debug("tcp_in_window: ");
 498        nf_ct_dump_tuple(tuple);
 499        pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
 500                 seq, ack, receiver_offset, sack, receiver_offset, win, end);
 501        pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
 502                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
 503                 sender->td_end, sender->td_maxend, sender->td_maxwin,
 504                 sender->td_scale,
 505                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
 506                 receiver->td_scale);
 507
 508        if (sender->td_maxwin == 0) {
 509                /*
 510                 * Initialize sender data.
 511                 */
 512                if (tcph->syn) {
 513                        /*
 514                         * SYN-ACK in reply to a SYN
 515                         * or SYN from reply direction in simultaneous open.
 516                         */
 517                        sender->td_end =
 518                        sender->td_maxend = end;
 519                        sender->td_maxwin = (win == 0 ? 1 : win);
 520
 521                        tcp_options(skb, dataoff, tcph, sender);
 522                        /*
 523                         * RFC 1323:
 524                         * Both sides must send the Window Scale option
 525                         * to enable window scaling in either direction.
 526                         */
 527                        if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
 528                              && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
 529                                sender->td_scale =
 530                                receiver->td_scale = 0;
 531                        if (!tcph->ack)
 532                                /* Simultaneous open */
 533                                return true;
 534                } else {
 535                        /*
 536                         * We are in the middle of a connection,
 537                         * its history is lost for us.
 538                         * Let's try to use the data from the packet.
 539                         */
 540                        sender->td_end = end;
 541                        swin = win << sender->td_scale;
 542                        sender->td_maxwin = (swin == 0 ? 1 : swin);
 543                        sender->td_maxend = end + sender->td_maxwin;
 544                        /*
 545                         * We haven't seen traffic in the other direction yet
 546                         * but we have to tweak window tracking to pass III
 547                         * and IV until that happens.
 548                         */
 549                        if (receiver->td_maxwin == 0)
 550                                receiver->td_end = receiver->td_maxend = sack;
 551                }
 552        } else if (((state->state == TCP_CONNTRACK_SYN_SENT
 553                     && dir == IP_CT_DIR_ORIGINAL)
 554                   || (state->state == TCP_CONNTRACK_SYN_RECV
 555                     && dir == IP_CT_DIR_REPLY))
 556                   && after(end, sender->td_end)) {
 557                /*
 558                 * RFC 793: "if a TCP is reinitialized ... then it need
 559                 * not wait at all; it must only be sure to use sequence
 560                 * numbers larger than those recently used."
 561                 */
 562                sender->td_end =
 563                sender->td_maxend = end;
 564                sender->td_maxwin = (win == 0 ? 1 : win);
 565
 566                tcp_options(skb, dataoff, tcph, sender);
 567        }
 568
 569        if (!(tcph->ack)) {
 570                /*
 571                 * If there is no ACK, just pretend it was set and OK.
 572                 */
 573                ack = sack = receiver->td_end;
 574        } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
 575                    (TCP_FLAG_ACK|TCP_FLAG_RST))
 576                   && (ack == 0)) {
 577                /*
 578                 * Broken TCP stacks, that set ACK in RST packets as well
 579                 * with zero ack value.
 580                 */
 581                ack = sack = receiver->td_end;
 582        }
 583
 584        if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
 585                /*
 586                 * RST sent answering SYN.
 587                 */
 588                seq = end = sender->td_end;
 589
 590        pr_debug("tcp_in_window: ");
 591        nf_ct_dump_tuple(tuple);
 592        pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
 593                 seq, ack, receiver_offset, sack, receiver_offset, win, end);
 594        pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
 595                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
 596                 sender->td_end, sender->td_maxend, sender->td_maxwin,
 597                 sender->td_scale,
 598                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
 599                 receiver->td_scale);
 600
 601        /* Is the ending sequence in the receive window (if available)? */
 602        in_recv_win = !receiver->td_maxwin ||
 603                      after(end, sender->td_end - receiver->td_maxwin - 1);
 604
 605        pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
 606                 before(seq, sender->td_maxend + 1),
 607                 (in_recv_win ? 1 : 0),
 608                 before(sack, receiver->td_end + 1),
 609                 after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
 610
 611        if (before(seq, sender->td_maxend + 1) &&
 612            in_recv_win &&
 613            before(sack, receiver->td_end + 1) &&
 614            after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
 615                /*
 616                 * Take into account window scaling (RFC 1323).
 617                 */
 618                if (!tcph->syn)
 619                        win <<= sender->td_scale;
 620
 621                /*
 622                 * Update sender data.
 623                 */
 624                swin = win + (sack - ack);
 625                if (sender->td_maxwin < swin)
 626                        sender->td_maxwin = swin;
 627                if (after(end, sender->td_end)) {
 628                        sender->td_end = end;
 629                        sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
 630                }
 631                if (tcph->ack) {
 632                        if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
 633                                sender->td_maxack = ack;
 634                                sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
 635                        } else if (after(ack, sender->td_maxack))
 636                                sender->td_maxack = ack;
 637                }
 638
 639                /*
 640                 * Update receiver data.
 641                 */
 642                if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
 643                        receiver->td_maxwin += end - sender->td_maxend;
 644                if (after(sack + win, receiver->td_maxend - 1)) {
 645                        receiver->td_maxend = sack + win;
 646                        if (win == 0)
 647                                receiver->td_maxend++;
 648                }
 649                if (ack == receiver->td_end)
 650                        receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
 651
 652                /*
 653                 * Check retransmissions.
 654                 */
 655                if (index == TCP_ACK_SET) {
 656                        if (state->last_dir == dir
 657                            && state->last_seq == seq
 658                            && state->last_ack == ack
 659                            && state->last_end == end
 660                            && state->last_win == win_raw)
 661                                state->retrans++;
 662                        else {
 663                                state->last_dir = dir;
 664                                state->last_seq = seq;
 665                                state->last_ack = ack;
 666                                state->last_end = end;
 667                                state->last_win = win_raw;
 668                                state->retrans = 0;
 669                        }
 670                }
 671                res = true;
 672        } else {
 673                res = false;
 674                if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
 675                    tn->tcp_be_liberal)
 676                        res = true;
 677                if (!res) {
 678                        nf_ct_l4proto_log_invalid(skb, ct,
 679                        "%s",
 680                        before(seq, sender->td_maxend + 1) ?
 681                        in_recv_win ?
 682                        before(sack, receiver->td_end + 1) ?
 683                        after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
 684                        : "ACK is under the lower bound (possible overly delayed ACK)"
 685                        : "ACK is over the upper bound (ACKed data not seen yet)"
 686                        : "SEQ is under the lower bound (already ACKed data retransmitted)"
 687                        : "SEQ is over the upper bound (over the window of the receiver)");
 688                }
 689        }
 690
 691        pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
 692                 "receiver end=%u maxend=%u maxwin=%u\n",
 693                 res, sender->td_end, sender->td_maxend, sender->td_maxwin,
 694                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
 695
 696        return res;
 697}
 698
 699/* table of valid flag combinations - PUSH, ECE and CWR are always valid */
 700static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
 701                                 TCPHDR_URG) + 1] =
 702{
 703        [TCPHDR_SYN]                            = 1,
 704        [TCPHDR_SYN|TCPHDR_URG]                 = 1,
 705        [TCPHDR_SYN|TCPHDR_ACK]                 = 1,
 706        [TCPHDR_RST]                            = 1,
 707        [TCPHDR_RST|TCPHDR_ACK]                 = 1,
 708        [TCPHDR_FIN|TCPHDR_ACK]                 = 1,
 709        [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]      = 1,
 710        [TCPHDR_ACK]                            = 1,
 711        [TCPHDR_ACK|TCPHDR_URG]                 = 1,
 712};
 713
 714static void tcp_error_log(const struct sk_buff *skb,
 715                          const struct nf_hook_state *state,
 716                          const char *msg)
 717{
 718        nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg);
 719}
 720
 721/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
 722static bool tcp_error(const struct tcphdr *th,
 723                      struct sk_buff *skb,
 724                      unsigned int dataoff,
 725                      const struct nf_hook_state *state)
 726{
 727        unsigned int tcplen = skb->len - dataoff;
 728        u8 tcpflags;
 729
 730        /* Not whole TCP header or malformed packet */
 731        if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
 732                tcp_error_log(skb, state, "truncated packet");
 733                return true;
 734        }
 735
 736        /* Checksum invalid? Ignore.
 737         * We skip checking packets on the outgoing path
 738         * because the checksum is assumed to be correct.
 739         */
 740        /* FIXME: Source route IP option packets --RR */
 741        if (state->net->ct.sysctl_checksum &&
 742            state->hook == NF_INET_PRE_ROUTING &&
 743            nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
 744                tcp_error_log(skb, state, "bad checksum");
 745                return true;
 746        }
 747
 748        /* Check TCP flags. */
 749        tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
 750        if (!tcp_valid_flags[tcpflags]) {
 751                tcp_error_log(skb, state, "invalid tcp flag combination");
 752                return true;
 753        }
 754
 755        return false;
 756}
 757
 758static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 759                             unsigned int dataoff,
 760                             const struct tcphdr *th)
 761{
 762        enum tcp_conntrack new_state;
 763        struct net *net = nf_ct_net(ct);
 764        const struct nf_tcp_net *tn = nf_tcp_pernet(net);
 765        const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
 766        const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
 767
 768        /* Don't need lock here: this conntrack not in circulation yet */
 769        new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
 770
 771        /* Invalid: delete conntrack */
 772        if (new_state >= TCP_CONNTRACK_MAX) {
 773                pr_debug("nf_ct_tcp: invalid new deleting.\n");
 774                return false;
 775        }
 776
 777        if (new_state == TCP_CONNTRACK_SYN_SENT) {
 778                memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
 779                /* SYN packet */
 780                ct->proto.tcp.seen[0].td_end =
 781                        segment_seq_plus_len(ntohl(th->seq), skb->len,
 782                                             dataoff, th);
 783                ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
 784                if (ct->proto.tcp.seen[0].td_maxwin == 0)
 785                        ct->proto.tcp.seen[0].td_maxwin = 1;
 786                ct->proto.tcp.seen[0].td_maxend =
 787                        ct->proto.tcp.seen[0].td_end;
 788
 789                tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
 790        } else if (tn->tcp_loose == 0) {
 791                /* Don't try to pick up connections. */
 792                return false;
 793        } else {
 794                memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
 795                /*
 796                 * We are in the middle of a connection,
 797                 * its history is lost for us.
 798                 * Let's try to use the data from the packet.
 799                 */
 800                ct->proto.tcp.seen[0].td_end =
 801                        segment_seq_plus_len(ntohl(th->seq), skb->len,
 802                                             dataoff, th);
 803                ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
 804                if (ct->proto.tcp.seen[0].td_maxwin == 0)
 805                        ct->proto.tcp.seen[0].td_maxwin = 1;
 806                ct->proto.tcp.seen[0].td_maxend =
 807                        ct->proto.tcp.seen[0].td_end +
 808                        ct->proto.tcp.seen[0].td_maxwin;
 809
 810                /* We assume SACK and liberal window checking to handle
 811                 * window scaling */
 812                ct->proto.tcp.seen[0].flags =
 813                ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
 814                                              IP_CT_TCP_FLAG_BE_LIBERAL;
 815        }
 816
 817        /* tcp_packet will set them */
 818        ct->proto.tcp.last_index = TCP_NONE_SET;
 819
 820        pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
 821                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
 822                 __func__,
 823                 sender->td_end, sender->td_maxend, sender->td_maxwin,
 824                 sender->td_scale,
 825                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
 826                 receiver->td_scale);
 827        return true;
 828}
 829
 830static bool nf_conntrack_tcp_established(const struct nf_conn *ct)
 831{
 832        return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED &&
 833               test_bit(IPS_ASSURED_BIT, &ct->status);
 834}
 835
 836/* Run one TCP packet through the conntrack state machine for @ct.
     *
     * @ct:      conntrack entry the packet was matched to
     * @skb:     the packet
     * @dataoff: offset of the TCP header inside @skb
     * @ctinfo:  used to derive the packet direction and handed on to the
     *           kill/refresh accounting helpers
     * @state:   hook state, used by the sanity checks in tcp_error()
     *
     * Returns NF_ACCEPT when the packet is let through (including packets
     * that are deliberately ignored), -NF_ACCEPT when the packet is invalid,
     * -NF_REPEAT when the old entry was killed because the connection is
     * being reopened and the lookup has to be repeated, and NF_DROP when
     * that kill could not be performed.
     */
 837int nf_conntrack_tcp_packet(struct nf_conn *ct,
 838                            struct sk_buff *skb,
 839                            unsigned int dataoff,
 840                            enum ip_conntrack_info ctinfo,
 841                            const struct nf_hook_state *state)
 842{
 843        struct net *net = nf_ct_net(ct);
 844        struct nf_tcp_net *tn = nf_tcp_pernet(net);
 845        struct nf_conntrack_tuple *tuple;
 846        enum tcp_conntrack new_state, old_state;
 847        unsigned int index, *timeouts;
 848        enum ip_conntrack_dir dir;
 849        const struct tcphdr *th;
 850        struct tcphdr _tcph;
 851        unsigned long timeout;
 852
            /* Without a complete fixed-size TCP header the packet is invalid */
 853        th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
 854        if (th == NULL)
 855                return -NF_ACCEPT;
 856
 857        if (tcp_error(th, skb, dataoff, state))
 858                return -NF_ACCEPT;
 859
            /* Unconfirmed entry: set up its tracking state from this packet */
 860        if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
 861                return -NF_ACCEPT;
 862
            /* Everything below up to the matching unlock works on the
             * per-connection TCP state and is done under ct->lock.
             */
 863        spin_lock_bh(&ct->lock);
 864        old_state = ct->proto.tcp.state;
 865        dir = CTINFO2DIR(ctinfo);
 866        index = get_conntrack_index(th);
 867        new_state = tcp_conntracks[dir][index][old_state];
 868        tuple = &ct->tuplehash[dir].tuple;
 869
 870        switch (new_state) {
 871        case TCP_CONNTRACK_SYN_SENT:
 872                if (old_state < TCP_CONNTRACK_TIME_WAIT)
 873                        break;
 874                /* RFC 1122: "When a connection is closed actively,
 875                 * it MUST linger in TIME-WAIT state for a time 2xMSL
 876                 * (Maximum Segment Lifetime). However, it MAY accept
 877                 * a new SYN from the remote TCP to reopen the connection
 878                 * directly from TIME-WAIT state, if..."
 879                 * We ignore the conditions because we are in the
 880                 * TIME-WAIT state anyway.
 881                 *
 882                 * Handle aborted connections: we and the server
 883                 * think there is an existing connection but the client
 884                 * aborts it and starts a new one.
 885                 */
 886                if (((ct->proto.tcp.seen[dir].flags
 887                      | ct->proto.tcp.seen[!dir].flags)
 888                     & IP_CT_TCP_FLAG_CLOSE_INIT)
 889                    || (ct->proto.tcp.last_dir == dir
 890                        && ct->proto.tcp.last_index == TCP_RST_SET)) {
 891                        /* Attempt to reopen a closed/aborted connection.
 892                         * Delete this connection and look up again. */
 893                        spin_unlock_bh(&ct->lock);
 894
 895                        /* Only repeat if we can actually remove the timer.
 896                         * Destruction may already be in progress in process
 897                         * context and we must give it a chance to terminate.
 898                         */
 899                        if (nf_ct_kill(ct))
 900                                return -NF_REPEAT;
 901                        return NF_DROP;
 902                }
 903                fallthrough;
 904        case TCP_CONNTRACK_IGNORE:
 905                /* Ignored packets:
 906                 *
 907                 * Our connection entry may be out of sync, so ignore
 908                 * packets which may signal the real connection between
 909                 * the client and the server.
 910                 *
 911                 * a) SYN in ORIGINAL
 912                 * b) SYN/ACK in REPLY
 913                 * c) ACK in reply direction after initial SYN in original.
 914                 *
 915                 * If the ignored packet is invalid, the receiver will send
 916                 * a RST we'll catch below.
 917                 */
 918                if (index == TCP_SYNACK_SET
 919                    && ct->proto.tcp.last_index == TCP_SYN_SET
 920                    && ct->proto.tcp.last_dir != dir
 921                    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
 922                        /* b) This SYN/ACK acknowledges a SYN that we earlier
 923                         * ignored as invalid. This means that the client and
 924                         * the server are both in sync, while the firewall is
 925                         * not. We get in sync from the previously annotated
 926                         * values.
 927                         */
 928                        old_state = TCP_CONNTRACK_SYN_SENT;
 929                        new_state = TCP_CONNTRACK_SYN_RECV;
 930                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
 931                                ct->proto.tcp.last_end;
 932                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
 933                                ct->proto.tcp.last_end;
 934                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
 935                                ct->proto.tcp.last_win == 0 ?
 936                                        1 : ct->proto.tcp.last_win;
 937                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
 938                                ct->proto.tcp.last_wscale;
 939                        ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
 940                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
 941                                ct->proto.tcp.last_flags;
 942                        memset(&ct->proto.tcp.seen[dir], 0,
 943                               sizeof(struct ip_ct_tcp_state));
 944                        break;
 945                }
                    /* Remember this packet so that a later SYN/ACK or RST
                     * can be matched against it.
                     */
 946                ct->proto.tcp.last_index = index;
 947                ct->proto.tcp.last_dir = dir;
 948                ct->proto.tcp.last_seq = ntohl(th->seq);
 949                ct->proto.tcp.last_end =
 950                    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
 951                ct->proto.tcp.last_win = ntohs(th->window);
 952
 953                /* a) This is a SYN in ORIGINAL. The client and the server
 954                 * may be in sync but we are not. In that case, we annotate
 955                 * the TCP options and let the packet go through. If it is a
 956                 * valid SYN packet, the server will reply with a SYN/ACK, and
 957                 * then we'll get in sync. Otherwise, the server potentially
 958                 * responds with a challenge ACK if implementing RFC5961.
 959                 */
 960                if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
 961                        struct ip_ct_tcp_state seen = {};
 962
 963                        ct->proto.tcp.last_flags =
 964                        ct->proto.tcp.last_wscale = 0;
 965                        tcp_options(skb, dataoff, th, &seen);
 966                        if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
 967                                ct->proto.tcp.last_flags |=
 968                                        IP_CT_TCP_FLAG_WINDOW_SCALE;
 969                                ct->proto.tcp.last_wscale = seen.td_scale;
 970                        }
 971                        if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
 972                                ct->proto.tcp.last_flags |=
 973                                        IP_CT_TCP_FLAG_SACK_PERM;
 974                        }
 975                        /* Mark the potential for RFC5961 challenge ACK,
 976                         * this poses a special problem for LAST_ACK state
 977                         * as ACK is interpreted as ACKing last FIN.
 978                         */
 979                        if (old_state == TCP_CONNTRACK_LAST_ACK)
 980                                ct->proto.tcp.last_flags |=
 981                                        IP_CT_EXP_CHALLENGE_ACK;
 982                }
 983                spin_unlock_bh(&ct->lock);
 984                nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in "
 985                                          "state %s ", tcp_conntrack_names[old_state]);
 986                return NF_ACCEPT;
 987        case TCP_CONNTRACK_MAX:
 988                /* Special case for SYN proxy: when the SYN to the server or
 989                 * the SYN/ACK from the server is lost, the client may transmit
 990                 * a keep-alive packet while in SYN_SENT state. This needs to
 991                 * be associated with the original conntrack entry in order to
 992                 * generate a new SYN with the correct sequence number.
 993                 */
 994                if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
 995                    index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
 996                    ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
 997                    ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
 998                        pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
 999                        spin_unlock_bh(&ct->lock);
1000                        return NF_ACCEPT;
1001                }
1002
1003                /* Invalid packet */
1004                pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
1005                         dir, get_conntrack_index(th), old_state);
1006                spin_unlock_bh(&ct->lock);
1007                nf_ct_l4proto_log_invalid(skb, ct, "invalid state");
1008                return -NF_ACCEPT;
1009        case TCP_CONNTRACK_TIME_WAIT:
1010                /* RFC5961 compliance causes the stack to send a
1011                 * "challenge-ACK" e.g. in response to spurious SYNs.
1012                 * Conntrack MUST not believe this ACK is acking last FIN.
1013                 */
1014                if (old_state == TCP_CONNTRACK_LAST_ACK &&
1015                    index == TCP_ACK_SET &&
1016                    ct->proto.tcp.last_dir != dir &&
1017                    ct->proto.tcp.last_index == TCP_SYN_SET &&
1018                    (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
1019                        /* Detected RFC5961 challenge ACK */
1020                        ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
1021                        spin_unlock_bh(&ct->lock);
1022                        nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored");
1023                        return NF_ACCEPT; /* Don't change state */
1024                }
1025                break;
1026        case TCP_CONNTRACK_SYN_SENT2:
1027                /* tcp_conntracks table is not smart enough to handle
1028                 * simultaneous open.
1029                 */
1030                ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
1031                break;
1032        case TCP_CONNTRACK_SYN_RECV:
                    /* An ACK in the reply direction after a simultaneous
                     * open moves the entry straight to ESTABLISHED.
                     */
1033                if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
1034                    ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
1035                        new_state = TCP_CONNTRACK_ESTABLISHED;
1036                break;
1037        case TCP_CONNTRACK_CLOSE:
1038                if (index != TCP_RST_SET)
1039                        break;
1040
1041                if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
1042                        u32 seq = ntohl(th->seq);
1043
1044                        if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
1045                                /* Invalid RST  */
1046                                spin_unlock_bh(&ct->lock);
1047                                nf_ct_l4proto_log_invalid(skb, ct, "invalid rst");
1048                                return -NF_ACCEPT;
1049                        }
1050
1051                        if (!nf_conntrack_tcp_established(ct) ||
1052                            seq == ct->proto.tcp.seen[!dir].td_maxack)
1053                                break;
1054
1055                        /* Check if rst is part of train, such as
1056                         *   foo:80 > bar:4379: P, 235946583:235946602(19) ack 42
1057                         *   foo:80 > bar:4379: R, 235946602:235946602(0)  ack 42
1058                         */
1059                        if (ct->proto.tcp.last_index == TCP_ACK_SET &&
1060                            ct->proto.tcp.last_dir == dir &&
1061                            seq == ct->proto.tcp.last_end)
1062                                break;
1063
1064                        /* ... RST sequence number doesn't match exactly, keep
1065                         * established state to allow a possible challenge ACK.
1066                         */
1067                        new_state = old_state;
1068                }
1069                if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
1070                         && ct->proto.tcp.last_index == TCP_SYN_SET)
1071                        || (!test_bit(IPS_ASSURED_BIT, &ct->status)
1072                            && ct->proto.tcp.last_index == TCP_ACK_SET))
1073                    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
1074                        /* RST sent to invalid SYN or ACK we had let through
1075                         * at a) and c) above:
1076                         *
1077                         * a) SYN was in window then
1078                         * c) we hold a half-open connection.
1079                         *
1080                         * Delete our connection entry.
1081                         * We skip window checking, because packet might ACK
1082                         * segments we ignored. */
1083                        goto in_window;
1084                }
1085                break;
1086        default:
1087                /* Keep compilers happy. */
1088                break;
1089        }
1090
1091        if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
1092                           skb, dataoff, th)) {
1093                spin_unlock_bh(&ct->lock);
1094                return -NF_ACCEPT;
1095        }
1096     in_window:
1097        /* From now on we have got in-window packets */
1098        ct->proto.tcp.last_index = index;
1099        ct->proto.tcp.last_dir = dir;
1100
1101        pr_debug("tcp_conntracks: ");
1102        nf_ct_dump_tuple(tuple);
1103        pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1104                 (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1105                 (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1106                 old_state, new_state);
1107
1108        ct->proto.tcp.state = new_state;
            /* Record which side initiated the close; the SYN_SENT case above
             * uses this flag to recognise a reopen attempt.
             */
1109        if (old_state != new_state
1110            && new_state == TCP_CONNTRACK_FIN_WAIT)
1111                ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1112
            /* Use the timeout set returned by nf_ct_timeout_lookup() if there
             * is one, otherwise the per-netns defaults.
             */
1113        timeouts = nf_ct_timeout_lookup(ct);
1114        if (!timeouts)
1115                timeouts = tn->timeouts;
1116
            /* Choose the timeout.  The RETRANS and UNACK values only ever
             * shorten the timeout of the new state; a RST always selects
             * the CLOSE timeout.
             */
1117        if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
1118            timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1119                timeout = timeouts[TCP_CONNTRACK_RETRANS];
1120        else if (unlikely(index == TCP_RST_SET))
1121                timeout = timeouts[TCP_CONNTRACK_CLOSE];
1122        else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
1123                 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
1124                 timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
1125                timeout = timeouts[TCP_CONNTRACK_UNACK];
1126        else if (ct->proto.tcp.last_win == 0 &&
1127                 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1128                timeout = timeouts[TCP_CONNTRACK_RETRANS];
1129        else
1130                timeout = timeouts[new_state];
1131        spin_unlock_bh(&ct->lock);
1132
1133        if (new_state != old_state)
1134                nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
1135
1136        if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1137                /* If only reply is a RST, we can consider ourselves not to
1138                   have an established connection: this is a fairly common
1139                   problem case, so we can delete the conntrack
1140                   immediately.  --RR */
1141                if (th->rst) {
1142                        nf_ct_kill_acct(ct, ctinfo, skb);
1143                        return NF_ACCEPT;
1144                }
1145                /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
1146                 * pickup with loose=1. Avoid large ESTABLISHED timeout.
1147                 */
1148                if (new_state == TCP_CONNTRACK_ESTABLISHED &&
1149                    timeout > timeouts[TCP_CONNTRACK_UNACK])
1150                        timeout = timeouts[TCP_CONNTRACK_UNACK];
1151        } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1152                   && (old_state == TCP_CONNTRACK_SYN_RECV
1153                       || old_state == TCP_CONNTRACK_ESTABLISHED)
1154                   && new_state == TCP_CONNTRACK_ESTABLISHED) {
1155                /* Set ASSURED if we see valid ack in ESTABLISHED
1156                   after SYN_RECV or a valid answer for a picked up
1157                   connection. */
1158                set_bit(IPS_ASSURED_BIT, &ct->status);
1159                nf_conntrack_event_cache(IPCT_ASSURED, ct);
1160        }
1161        nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1162
1163        return NF_ACCEPT;
1164}
1165
1166static bool tcp_can_early_drop(const struct nf_conn *ct)
1167{
1168        switch (ct->proto.tcp.state) {
1169        case TCP_CONNTRACK_FIN_WAIT:
1170        case TCP_CONNTRACK_LAST_ACK:
1171        case TCP_CONNTRACK_TIME_WAIT:
1172        case TCP_CONNTRACK_CLOSE:
1173        case TCP_CONNTRACK_CLOSE_WAIT:
1174                return true;
1175        default:
1176                break;
1177        }
1178
1179        return false;
1180}
1181
1182#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1183
1184#include <linux/netfilter/nfnetlink.h>
1185#include <linux/netfilter/nfnetlink_conntrack.h>
1186
1187static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
1188                         struct nf_conn *ct)
1189{
1190        struct nlattr *nest_parms;
1191        struct nf_ct_tcp_flags tmp = {};
1192
1193        spin_lock_bh(&ct->lock);
1194        nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP);
1195        if (!nest_parms)
1196                goto nla_put_failure;
1197
1198        if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) ||
1199            nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
1200                       ct->proto.tcp.seen[0].td_scale) ||
1201            nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
1202                       ct->proto.tcp.seen[1].td_scale))
1203                goto nla_put_failure;
1204
1205        tmp.flags = ct->proto.tcp.seen[0].flags;
1206        if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
1207                    sizeof(struct nf_ct_tcp_flags), &tmp))
1208                goto nla_put_failure;
1209
1210        tmp.flags = ct->proto.tcp.seen[1].flags;
1211        if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
1212                    sizeof(struct nf_ct_tcp_flags), &tmp))
1213                goto nla_put_failure;
1214        spin_unlock_bh(&ct->lock);
1215
1216        nla_nest_end(skb, nest_parms);
1217
1218        return 0;
1219
1220nla_put_failure:
1221        spin_unlock_bh(&ct->lock);
1222        return -1;
1223}
1224
1225static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
1226        [CTA_PROTOINFO_TCP_STATE]           = { .type = NLA_U8 },
1227        [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
1228        [CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
1229        [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
1230        [CTA_PROTOINFO_TCP_FLAGS_REPLY]     = { .len = sizeof(struct nf_ct_tcp_flags) },
1231};
1232
/*
 * Size estimate for the TCP protocol info attributes: two one-byte
 * attributes plus two struct nf_ct_tcp_flags attributes, each with its
 * header and padded to netlink alignment.
 *
 * NOTE(review): tcp_to_nlattr() emits three one-byte attributes (state and
 * both window scales) inside a nest, while this estimate only accounts for
 * two and no nest header - confirm that the users of this size budget for
 * the difference.
 */
#define TCP_NLATTR_SIZE                                                 \
        (2 * NLA_ALIGN(NLA_HDRLEN + 1) +                                \
         2 * NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))
1238
1239static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1240{
1241        struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1242        struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1243        int err;
1244
1245        /* updates could not contain anything about the private
1246         * protocol info, in that case skip the parsing */
1247        if (!pattr)
1248                return 0;
1249
1250        err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr,
1251                                          tcp_nla_policy, NULL);
1252        if (err < 0)
1253                return err;
1254
1255        if (tb[CTA_PROTOINFO_TCP_STATE] &&
1256            nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1257                return -EINVAL;
1258
1259        spin_lock_bh(&ct->lock);
1260        if (tb[CTA_PROTOINFO_TCP_STATE])
1261                ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1262
1263        if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
1264                struct nf_ct_tcp_flags *attr =
1265                        nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1266                ct->proto.tcp.seen[0].flags &= ~attr->mask;
1267                ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1268        }
1269
1270        if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
1271                struct nf_ct_tcp_flags *attr =
1272                        nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1273                ct->proto.tcp.seen[1].flags &= ~attr->mask;
1274                ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1275        }
1276
1277        if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
1278            tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
1279            ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1280            ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1281                ct->proto.tcp.seen[0].td_scale =
1282                        nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1283                ct->proto.tcp.seen[1].td_scale =
1284                        nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1285        }
1286        spin_unlock_bh(&ct->lock);
1287
1288        return 0;
1289}
1290
1291static unsigned int tcp_nlattr_tuple_size(void)
1292{
1293        static unsigned int size __read_mostly;
1294
1295        if (!size)
1296                size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1297
1298        return size;
1299}
1300#endif
1301
1302#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1303
1304#include <linux/netfilter/nfnetlink.h>
1305#include <linux/netfilter/nfnetlink_cttimeout.h>
1306
1307static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
1308                                     struct net *net, void *data)
1309{
1310        struct nf_tcp_net *tn = nf_tcp_pernet(net);
1311        unsigned int *timeouts = data;
1312        int i;
1313
1314        if (!timeouts)
1315                timeouts = tn->timeouts;
1316        /* set default TCP timeouts. */
1317        for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
1318                timeouts[i] = tn->timeouts[i];
1319
1320        if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
1321                timeouts[TCP_CONNTRACK_SYN_SENT] =
1322                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
1323        }
1324
1325        if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
1326                timeouts[TCP_CONNTRACK_SYN_RECV] =
1327                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
1328        }
1329        if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
1330                timeouts[TCP_CONNTRACK_ESTABLISHED] =
1331                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
1332        }
1333        if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
1334                timeouts[TCP_CONNTRACK_FIN_WAIT] =
1335                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
1336        }
1337        if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
1338                timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
1339                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
1340        }
1341        if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
1342                timeouts[TCP_CONNTRACK_LAST_ACK] =
1343                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
1344        }
1345        if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
1346                timeouts[TCP_CONNTRACK_TIME_WAIT] =
1347                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
1348        }
1349        if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
1350                timeouts[TCP_CONNTRACK_CLOSE] =
1351                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
1352        }
1353        if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
1354                timeouts[TCP_CONNTRACK_SYN_SENT2] =
1355                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
1356        }
1357        if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
1358                timeouts[TCP_CONNTRACK_RETRANS] =
1359                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
1360        }
1361        if (tb[CTA_TIMEOUT_TCP_UNACK]) {
1362                timeouts[TCP_CONNTRACK_UNACK] =
1363                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
1364        }
1365
1366        timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT];
1367        return 0;
1368}
1369
1370static int
1371tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
1372{
1373        const unsigned int *timeouts = data;
1374
1375        if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
1376                        htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
1377            nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
1378                         htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
1379            nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
1380                         htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
1381            nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
1382                         htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
1383            nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
1384                         htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
1385            nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
1386                         htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
1387            nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
1388                         htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
1389            nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
1390                         htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
1391            nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
1392                         htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
1393            nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
1394                         htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
1395            nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
1396                         htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
1397                goto nla_put_failure;
1398        return 0;
1399
1400nla_put_failure:
1401        return -ENOSPC;
1402}
1403
1404static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
1405        [CTA_TIMEOUT_TCP_SYN_SENT]      = { .type = NLA_U32 },
1406        [CTA_TIMEOUT_TCP_SYN_RECV]      = { .type = NLA_U32 },
1407        [CTA_TIMEOUT_TCP_ESTABLISHED]   = { .type = NLA_U32 },
1408        [CTA_TIMEOUT_TCP_FIN_WAIT]      = { .type = NLA_U32 },
1409        [CTA_TIMEOUT_TCP_CLOSE_WAIT]    = { .type = NLA_U32 },
1410        [CTA_TIMEOUT_TCP_LAST_ACK]      = { .type = NLA_U32 },
1411        [CTA_TIMEOUT_TCP_TIME_WAIT]     = { .type = NLA_U32 },
1412        [CTA_TIMEOUT_TCP_CLOSE]         = { .type = NLA_U32 },
1413        [CTA_TIMEOUT_TCP_SYN_SENT2]     = { .type = NLA_U32 },
1414        [CTA_TIMEOUT_TCP_RETRANS]       = { .type = NLA_U32 },
1415        [CTA_TIMEOUT_TCP_UNACK]         = { .type = NLA_U32 },
1416};
1417#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1418
1419void nf_conntrack_tcp_init_net(struct net *net)
1420{
1421        struct nf_tcp_net *tn = nf_tcp_pernet(net);
1422        int i;
1423
1424        for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1425                tn->timeouts[i] = tcp_timeouts[i];
1426
1427        /* timeouts[0] is unused, make it same as SYN_SENT so
1428         * ->timeouts[0] contains 'new' timeout, like udp or icmp.
1429         */
1430        tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
1431        tn->tcp_loose = nf_ct_tcp_loose;
1432        tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
1433        tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
1434}
1435
1436const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
1437{
1438        .l4proto                = IPPROTO_TCP,
1439#ifdef CONFIG_NF_CONNTRACK_PROCFS
1440        .print_conntrack        = tcp_print_conntrack,
1441#endif
1442        .can_early_drop         = tcp_can_early_drop,
1443#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1444        .to_nlattr              = tcp_to_nlattr,
1445        .from_nlattr            = nlattr_to_tcp,
1446        .tuple_to_nlattr        = nf_ct_port_tuple_to_nlattr,
1447        .nlattr_to_tuple        = nf_ct_port_nlattr_to_tuple,
1448        .nlattr_tuple_size      = tcp_nlattr_tuple_size,
1449        .nlattr_size            = TCP_NLATTR_SIZE,
1450        .nla_policy             = nf_ct_port_nla_policy,
1451#endif
1452#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
1453        .ctnl_timeout           = {
1454                .nlattr_to_obj  = tcp_timeout_nlattr_to_obj,
1455                .obj_to_nlattr  = tcp_timeout_obj_to_nlattr,
1456                .nlattr_max     = CTA_TIMEOUT_TCP_MAX,
1457                .obj_size       = sizeof(unsigned int) *
1458                                        TCP_CONNTRACK_TIMEOUT_MAX,
1459                .nla_policy     = tcp_timeout_nla_policy,
1460        },
1461#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
1462};
1463