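/*
 * TCP input processing: receive-side buffer and RTT estimation, delayed-ACK
 * and quick-ACK handling, SACK/D-SACK scoreboard tagging, and the
 * congestion-control state machine driven from incoming ACKs.
 */
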
64#define pr_fmt(fmt) "TCP: " fmt
65
66#include <linux/mm.h>
67#include <linux/slab.h>
68#include <linux/module.h>
69#include <linux/sysctl.h>
70#include <linux/kernel.h>
71#include <linux/prefetch.h>
72#include <net/dst.h>
73#include <net/tcp.h>
74#include <net/inet_common.h>
75#include <linux/ipsec.h>
76#include <asm/unaligned.h>
77#include <linux/errqueue.h>
78
79int sysctl_tcp_timestamps __read_mostly = 1;
80int sysctl_tcp_window_scaling __read_mostly = 1;
81int sysctl_tcp_sack __read_mostly = 1;
82int sysctl_tcp_fack __read_mostly = 1;
83int sysctl_tcp_max_reordering __read_mostly = 300;
84int sysctl_tcp_dsack __read_mostly = 1;
85int sysctl_tcp_app_win __read_mostly = 31;
86int sysctl_tcp_adv_win_scale __read_mostly = 1;
87EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
88
89
90int sysctl_tcp_challenge_ack_limit = 100;
91
92int sysctl_tcp_stdurg __read_mostly;
93int sysctl_tcp_rfc1337 __read_mostly;
94int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
95int sysctl_tcp_frto __read_mostly = 2;
96int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
97
98int sysctl_tcp_thin_dupack __read_mostly;
99
100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
101int sysctl_tcp_early_retrans __read_mostly = 3;
102int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
103
104#define FLAG_DATA 0x01
105#define FLAG_WIN_UPDATE 0x02
106#define FLAG_DATA_ACKED 0x04
107#define FLAG_RETRANS_DATA_ACKED 0x08
108#define FLAG_SYN_ACKED 0x10
109#define FLAG_DATA_SACKED 0x20
110#define FLAG_ECE 0x40
111#define FLAG_LOST_RETRANS 0x80
112#define FLAG_SLOWPATH 0x100
113#define FLAG_ORIG_SACK_ACKED 0x200
114#define FLAG_SND_UNA_ADVANCED 0x400
115#define FLAG_DSACKING_ACK 0x800
116#define FLAG_SACK_RENEGING 0x2000
117#define FLAG_UPDATE_TS_RECENT 0x4000
118
119#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
120#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
121#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
122#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
123
124#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
125#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
126
127#define REXMIT_NONE 0
128#define REXMIT_LOST 1
129#define REXMIT_NEW 2
130
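/* Adapt the MSS estimate used for delayed-ACK decisions to the segment
 * sizes actually observed on the receive path.
 */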
134static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
135{
136 struct inet_connection_sock *icsk = inet_csk(sk);
137 const unsigned int lss = icsk->icsk_ack.last_seg_size;
138 unsigned int len;
139
140 icsk->icsk_ack.last_seg_size = 0;
141
142
143
144
145 len = skb_shinfo(skb)->gso_size ? : skb->len;
146 if (len >= icsk->icsk_ack.rcv_mss) {
147 icsk->icsk_ack.rcv_mss = len;
148 } else {
149
150
151
152
153
154 len += skb->data - skb_transport_header(skb);
155 if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
156
157
158
159
160
161 (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
162 !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
163
164
165
166
167 len -= tcp_sk(sk)->tcp_header_len;
168 icsk->icsk_ack.last_seg_size = len;
169 if (len == lss) {
170 icsk->icsk_ack.rcv_mss = len;
171 return;
172 }
173 }
174 if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
175 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
176 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
177 }
178}
179
180static void tcp_incr_quickack(struct sock *sk)
181{
182 struct inet_connection_sock *icsk = inet_csk(sk);
183 unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
184
185 if (quickacks == 0)
186 quickacks = 2;
187 if (quickacks > icsk->icsk_ack.quick)
188 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
189}
190
191static void tcp_enter_quickack_mode(struct sock *sk)
192{
193 struct inet_connection_sock *icsk = inet_csk(sk);
194 tcp_incr_quickack(sk);
195 icsk->icsk_ack.pingpong = 0;
196 icsk->icsk_ack.ato = TCP_ATO_MIN;
197}
198
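/* Send ACKs quickly while the "quick" budget lasts and the session is not
 * in interactive (pingpong) mode, or whenever the route asks for it.
 */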
203static bool tcp_in_quickack_mode(struct sock *sk)
204{
205 const struct inet_connection_sock *icsk = inet_csk(sk);
206 const struct dst_entry *dst = __sk_dst_get(sk);
207
208 return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
209 (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
210}
211
212static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
213{
214 if (tp->ecn_flags & TCP_ECN_OK)
215 tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
216}
217
218static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
219{
220 if (tcp_hdr(skb)->cwr)
221 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
222}
223
224static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
225{
226 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
227}
228
229static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
230{
231 switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
232 case INET_ECN_NOT_ECT:
233
234
235
236
237 if (tp->ecn_flags & TCP_ECN_SEEN)
238 tcp_enter_quickack_mode((struct sock *)tp);
239 break;
240 case INET_ECN_CE:
241 if (tcp_ca_needs_ecn((struct sock *)tp))
242 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
243
244 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
245
246 tcp_enter_quickack_mode((struct sock *)tp);
247 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
248 }
249 tp->ecn_flags |= TCP_ECN_SEEN;
250 break;
251 default:
252 if (tcp_ca_needs_ecn((struct sock *)tp))
253 tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
254 tp->ecn_flags |= TCP_ECN_SEEN;
255 break;
256 }
257}
258
259static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
260{
261 if (tp->ecn_flags & TCP_ECN_OK)
262 __tcp_ecn_check_ce(tp, skb);
263}
264
265static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
266{
267 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
268 tp->ecn_flags &= ~TCP_ECN_OK;
269}
270
271static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
272{
273 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
274 tp->ecn_flags &= ~TCP_ECN_OK;
275}
276
277static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
278{
279 if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
280 return true;
281 return false;
282}
283
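/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf when the connection enters established state:
 *    size the send buffer for roughly twice the expected flight, accounting
 *    for per-segment skb and header overhead.
 */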
289static void tcp_sndbuf_expand(struct sock *sk)
290{
291 const struct tcp_sock *tp = tcp_sk(sk);
292 int sndmem, per_mss;
293 u32 nr_segs;
294
295
296
297
298 per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
299 MAX_TCP_HEADER +
300 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
301
302 per_mss = roundup_pow_of_two(per_mss) +
303 SKB_DATA_ALIGN(sizeof(struct sk_buff));
304
305 nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
306 nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
307
308
309
310
311
312 sndmem = 2 * nr_segs * per_mss;
313
314 if (sk->sk_sndbuf < sndmem)
315 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
316}
317
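/* 2. Tuning the advertised window (window_clamp, rcv_ssthresh).
 *
 * The receive window is grown conservatively: rcv_ssthresh is only raised
 * while incoming skbs show a reasonable payload-to-truesize ratio and the
 * socket is not under memory pressure, so an overhead-heavy sender cannot
 * force us to commit excessive memory.
 */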
344static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
345{
346 struct tcp_sock *tp = tcp_sk(sk);
347
348 int truesize = tcp_win_from_space(skb->truesize) >> 1;
349 int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
350
351 while (tp->rcv_ssthresh <= window) {
352 if (truesize <= skb->len)
353 return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
354
355 truesize >>= 1;
356 window >>= 1;
357 }
358 return 0;
359}
360
361static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
362{
363 struct tcp_sock *tp = tcp_sk(sk);
364
365
366 if (tp->rcv_ssthresh < tp->window_clamp &&
367 (int)tp->rcv_ssthresh < tcp_space(sk) &&
368 !tcp_under_memory_pressure(sk)) {
369 int incr;
370
371
372
373
374 if (tcp_win_from_space(skb->truesize) <= skb->len)
375 incr = 2 * tp->advmss;
376 else
377 incr = __tcp_grow_window(sk, skb);
378
379 if (incr) {
380 incr = max_t(int, incr, 2 * skb->len);
381 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
382 tp->window_clamp);
383 inet_csk(sk)->icsk_ack.quick |= 1;
384 }
385 }
386}
387
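/* 3. Tuning sk->sk_rcvbuf when the connection enters established state. */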
389static void tcp_fixup_rcvbuf(struct sock *sk)
390{
391 u32 mss = tcp_sk(sk)->advmss;
392 int rcvmem;
393
394 rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
395 tcp_default_init_rwnd(mss);
396
397
398
399
400 if (sysctl_tcp_moderate_rcvbuf)
401 rcvmem <<= 2;
402
403 if (sk->sk_rcvbuf < rcvmem)
404 sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
405}
406
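/* 4. Initialize all buffer sizes together, immediately after the connection
 * enters established state.
 */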
410void tcp_init_buffer_space(struct sock *sk)
411{
412 struct tcp_sock *tp = tcp_sk(sk);
413 int maxwin;
414
415 if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
416 tcp_fixup_rcvbuf(sk);
417 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
418 tcp_sndbuf_expand(sk);
419
420 tp->rcvq_space.space = tp->rcv_wnd;
421 tp->rcvq_space.time = tcp_time_stamp;
422 tp->rcvq_space.seq = tp->copied_seq;
423
424 maxwin = tcp_full_space(sk);
425
426 if (tp->window_clamp >= maxwin) {
427 tp->window_clamp = maxwin;
428
429 if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
430 tp->window_clamp = max(maxwin -
431 (maxwin >> sysctl_tcp_app_win),
432 4 * tp->advmss);
433 }
434
435
436 if (sysctl_tcp_app_win &&
437 tp->window_clamp > 2 * tp->advmss &&
438 tp->window_clamp + tp->advmss > maxwin)
439 tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
440
441 tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
442 tp->snd_cwnd_stamp = tcp_time_stamp;
443}
444
445
446static void tcp_clamp_window(struct sock *sk)
447{
448 struct tcp_sock *tp = tcp_sk(sk);
449 struct inet_connection_sock *icsk = inet_csk(sk);
450
451 icsk->icsk_ack.quick = 0;
452
453 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
454 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
455 !tcp_under_memory_pressure(sk) &&
456 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
457 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
458 sysctl_tcp_rmem[2]);
459 }
460 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
461 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
462}
463
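/* Initialize the rcv_mss guess used for delayed-ACK decisions.
 *
 * rcv_mss is our estimate of the MSS the peer uses when sending.  Start
 * from advmss/mss_cache, bound it by half the receive window and by
 * [TCP_MIN_MSS, TCP_MSS_DEFAULT]; tcp_measure_rcv_mss() refines it as real
 * segments arrive.
 */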
471void tcp_initialize_rcv_mss(struct sock *sk)
472{
473 const struct tcp_sock *tp = tcp_sk(sk);
474 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
475
476 hint = min(hint, tp->rcv_wnd / 2);
477 hint = min(hint, TCP_MSS_DEFAULT);
478 hint = max(hint, TCP_MIN_MSS);
479
480 inet_csk(sk)->icsk_ack.rcv_mss = hint;
481}
482EXPORT_SYMBOL(tcp_initialize_rcv_mss);
483
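/* Receiver-side RTT estimation, used to size the receive buffer.
 *
 * When win_dep is set the sample comes from the window-based heuristic
 * (tcp_rcv_rtt_measure) and is only allowed to lower the estimate; when
 * clear the sample comes from timestamps and is mixed in with the usual
 * 7/8 EWMA.  The estimate is kept scaled by 8.
 */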
495static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
496{
497 u32 new_sample = tp->rcv_rtt_est.rtt;
498 long m = sample;
499
500 if (m == 0)
501 m = 1;
502
503 if (new_sample != 0) {
504
505
506
507
508
509
510
511
512
513
514 if (!win_dep) {
515 m -= (new_sample >> 3);
516 new_sample += m;
517 } else {
518 m <<= 3;
519 if (m < new_sample)
520 new_sample = m;
521 }
522 } else {
523
524 new_sample = m << 3;
525 }
526
527 if (tp->rcv_rtt_est.rtt != new_sample)
528 tp->rcv_rtt_est.rtt = new_sample;
529}
530
531static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
532{
533 if (tp->rcv_rtt_est.time == 0)
534 goto new_measure;
535 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
536 return;
537 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
538
539new_measure:
540 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
541 tp->rcv_rtt_est.time = tcp_time_stamp;
542}
543
544static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
545 const struct sk_buff *skb)
546{
547 struct tcp_sock *tp = tcp_sk(sk);
548 if (tp->rx_opt.rcv_tsecr &&
549 (TCP_SKB_CB(skb)->end_seq -
550 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
551 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
552}
553
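/* Called every time data is copied to user space; performs dynamic
 * right-sizing of the receive buffer (and window clamp) based on the
 * amount of data the application consumed over the last RTT.
 */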
558void tcp_rcv_space_adjust(struct sock *sk)
559{
560 struct tcp_sock *tp = tcp_sk(sk);
561 int time;
562 int copied;
563
564 time = tcp_time_stamp - tp->rcvq_space.time;
565 if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
566 return;
567
568
569 copied = tp->copied_seq - tp->rcvq_space.seq;
570 if (copied <= tp->rcvq_space.space)
571 goto new_measure;
572
573
574
575
576
577
578
579
580
581
582 if (sysctl_tcp_moderate_rcvbuf &&
583 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
584 int rcvwin, rcvmem, rcvbuf;
585
586
587
588
589 rcvwin = (copied << 1) + 16 * tp->advmss;
590
591
592
593
594
595
596 if (copied >=
597 tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
598 if (copied >=
599 tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
600 rcvwin <<= 1;
601 else
602 rcvwin += (rcvwin >> 1);
603 }
604
605 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
606 while (tcp_win_from_space(rcvmem) < tp->advmss)
607 rcvmem += 128;
608
609 rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
610 if (rcvbuf > sk->sk_rcvbuf) {
611 sk->sk_rcvbuf = rcvbuf;
612
613
614 tp->window_clamp = rcvwin;
615 }
616 }
617 tp->rcvq_space.space = copied;
618
619new_measure:
620 tp->rcvq_space.seq = tp->copied_seq;
621 tp->rcvq_space.time = tcp_time_stamp;
622}
623
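/* Per-segment receive-side bookkeeping: schedule an ACK, update the rcv_mss
 * and RTT estimates, adapt the delayed-ACK timeout (ato) to the observed
 * inter-arrival times, check ECN, and grow the receive window.
 */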
634static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
635{
636 struct tcp_sock *tp = tcp_sk(sk);
637 struct inet_connection_sock *icsk = inet_csk(sk);
638 u32 now;
639
640 inet_csk_schedule_ack(sk);
641
642 tcp_measure_rcv_mss(sk, skb);
643
644 tcp_rcv_rtt_measure(tp);
645
646 now = tcp_time_stamp;
647
648 if (!icsk->icsk_ack.ato) {
649
650
651
652 tcp_incr_quickack(sk);
653 icsk->icsk_ack.ato = TCP_ATO_MIN;
654 } else {
655 int m = now - icsk->icsk_ack.lrcvtime;
656
657 if (m <= TCP_ATO_MIN / 2) {
658
659 icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
660 } else if (m < icsk->icsk_ack.ato) {
661 icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
662 if (icsk->icsk_ack.ato > icsk->icsk_rto)
663 icsk->icsk_ack.ato = icsk->icsk_rto;
664 } else if (m > icsk->icsk_rto) {
665
666
667
668 tcp_incr_quickack(sk);
669 sk_mem_reclaim(sk);
670 }
671 }
672 icsk->icsk_ack.lrcvtime = now;
673
674 tcp_ecn_check_ce(tp, skb);
675
676 if (skb->len >= 128)
677 tcp_grow_window(sk, skb);
678}
679
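/* Classic Van Jacobson RTT estimator with the usual refinements: srtt is an
 * EWMA with gain 1/8, mdev/rttvar track the mean deviation with gain 1/4,
 * and srtt_us is kept scaled by 8.
 */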
689static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
690{
691 struct tcp_sock *tp = tcp_sk(sk);
692 long m = mrtt_us;
693 u32 srtt = tp->srtt_us;
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711 if (srtt != 0) {
712 m -= (srtt >> 3);
713 srtt += m;
714 if (m < 0) {
715 m = -m;
716 m -= (tp->mdev_us >> 2);
717
718
719
720
721
722
723
724
725 if (m > 0)
726 m >>= 3;
727 } else {
728 m -= (tp->mdev_us >> 2);
729 }
730 tp->mdev_us += m;
731 if (tp->mdev_us > tp->mdev_max_us) {
732 tp->mdev_max_us = tp->mdev_us;
733 if (tp->mdev_max_us > tp->rttvar_us)
734 tp->rttvar_us = tp->mdev_max_us;
735 }
736 if (after(tp->snd_una, tp->rtt_seq)) {
737 if (tp->mdev_max_us < tp->rttvar_us)
738 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
739 tp->rtt_seq = tp->snd_nxt;
740 tp->mdev_max_us = tcp_rto_min_us(sk);
741 }
742 } else {
743
744 srtt = m << 3;
745 tp->mdev_us = m << 1;
746 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
747 tp->mdev_max_us = tp->rttvar_us;
748 tp->rtt_seq = tp->snd_nxt;
749 }
750 tp->srtt_us = max(1U, srtt);
751}
752
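/* Compute sk_pacing_rate from cwnd and srtt so that a pacing-capable packet
 * scheduler (e.g. fq) can smooth bursts.  The rate is scaled up by
 * sysctl_tcp_pacing_ss_ratio in slow start and by sysctl_tcp_pacing_ca_ratio
 * afterwards.
 */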
759int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
760int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
761
762static void tcp_update_pacing_rate(struct sock *sk)
763{
764 const struct tcp_sock *tp = tcp_sk(sk);
765 u64 rate;
766
767
768 rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
769
770
771
772
773
774
775
776
777
778 if (tp->snd_cwnd < tp->snd_ssthresh / 2)
779 rate *= sysctl_tcp_pacing_ss_ratio;
780 else
781 rate *= sysctl_tcp_pacing_ca_ratio;
782
783 rate *= max(tp->snd_cwnd, tp->packets_out);
784
785 if (likely(tp->srtt_us))
786 do_div(rate, tp->srtt_us);
787
788
789
790
791
792 ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
793 sk->sk_max_pacing_rate);
794}
795
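/* Calculate the retransmission timeout (without backoff) from the current
 * srtt/rttvar estimates, then bound it by TCP_RTO_MAX.
 */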
799static void tcp_set_rto(struct sock *sk)
800{
801 const struct tcp_sock *tp = tcp_sk(sk);
802
803
804
805
806
807
808
809
810
811
812 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
813
814
815
816
817
818
819
820
821
822
823 tcp_bound_rto(sk);
824}
825
826__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
827{
828 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
829
830 if (!cwnd)
831 cwnd = TCP_INIT_CWND;
832 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
833}
834
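/* FACK packet counting assumes in-order delivery, so it is disabled as soon
 * as reordering is detected.
 */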
839void tcp_disable_fack(struct tcp_sock *tp)
840{
841
842 if (tcp_is_fack(tp))
843 tp->lost_skb_hint = NULL;
844 tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
845}
846
847
848static void tcp_dsack_seen(struct tcp_sock *tp)
849{
850 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
851}
852
853static void tcp_update_reordering(struct sock *sk, const int metric,
854 const int ts)
855{
856 struct tcp_sock *tp = tcp_sk(sk);
857 if (metric > tp->reordering) {
858 int mib_idx;
859
860 tp->reordering = min(sysctl_tcp_max_reordering, metric);
861
862
863 if (ts)
864 mib_idx = LINUX_MIB_TCPTSREORDER;
865 else if (tcp_is_reno(tp))
866 mib_idx = LINUX_MIB_TCPRENOREORDER;
867 else if (tcp_is_fack(tp))
868 mib_idx = LINUX_MIB_TCPFACKREORDER;
869 else
870 mib_idx = LINUX_MIB_TCPSACKREORDER;
871
872 NET_INC_STATS_BH(sock_net(sk), mib_idx);
873#if FASTRETRANS_DEBUG > 1
874 pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
875 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
876 tp->reordering,
877 tp->fackets_out,
878 tp->sacked_out,
879 tp->undo_marker ? tp->undo_retrans : 0);
880#endif
881 tcp_disable_fack(tp);
882 }
883
884 if (metric > 0)
885 tcp_disable_early_retrans(tp);
886 tp->rack.reord = 1;
887}
888
889
890static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
891{
892 if (!tp->retransmit_skb_hint ||
893 before(TCP_SKB_CB(skb)->seq,
894 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
895 tp->retransmit_skb_hint = skb;
896
897 if (!tp->lost_out ||
898 after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
899 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
900}
901
902static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
903{
904 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
905 tcp_verify_retransmit_hint(tp, skb);
906
907 tp->lost_out += tcp_skb_pcount(skb);
908 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
909 }
910}
911
912void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
913{
914 tcp_verify_retransmit_hint(tp, skb);
915
916 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
917 tp->lost_out += tcp_skb_pcount(skb);
918 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
919 }
920}
921
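/* Validate an incoming (D)SACK block before trusting it.
 *
 * A SACK block must be non-empty and lie between snd_una and snd_nxt.
 * A D-SACK block may additionally cover already-ACKed data, but only back
 * to undo_marker (or within one max_window below it), since anything older
 * may have wrapped.  Blocks failing these checks are ignored.
 */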
1016static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
1017 u32 start_seq, u32 end_seq)
1018{
1019
1020 if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
1021 return false;
1022
1023
1024 if (!before(start_seq, tp->snd_nxt))
1025 return false;
1026
1027
1028
1029
1030 if (after(start_seq, tp->snd_una))
1031 return true;
1032
1033 if (!is_dsack || !tp->undo_marker)
1034 return false;
1035
1036
1037 if (after(end_seq, tp->snd_una))
1038 return false;
1039
1040 if (!before(start_seq, tp->undo_marker))
1041 return true;
1042
1043
1044 if (!after(end_seq, tp->undo_marker))
1045 return false;
1046
1047
1048
1049
1050 return !before(start_seq, end_seq - tp->max_window);
1051}
1052
1053static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1054 struct tcp_sack_block_wire *sp, int num_sacks,
1055 u32 prior_snd_una)
1056{
1057 struct tcp_sock *tp = tcp_sk(sk);
1058 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
1059 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1060 bool dup_sack = false;
1061
1062 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1063 dup_sack = true;
1064 tcp_dsack_seen(tp);
1065 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1066 } else if (num_sacks > 1) {
1067 u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
1068 u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
1069
1070 if (!after(end_seq_0, end_seq_1) &&
1071 !before(start_seq_0, start_seq_1)) {
1072 dup_sack = true;
1073 tcp_dsack_seen(tp);
1074 NET_INC_STATS_BH(sock_net(sk),
1075 LINUX_MIB_TCPDSACKOFORECV);
1076 }
1077 }
1078
1079
1080 if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
1081 !after(end_seq_0, prior_snd_una) &&
1082 after(end_seq_0, tp->undo_marker))
1083 tp->undo_retrans--;
1084
1085 return dup_sack;
1086}
1087
1088struct tcp_sacktag_state {
1089 int reord;
1090 int fack_count;
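	/* Timestamps of the earliest and latest never-retransmitted segments
	 * SACKed by this ACK, used for RTT measurement.
	 */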
1095 struct skb_mstamp first_sackt;
1096 struct skb_mstamp last_sackt;
1097 int flag;
1098};
1099
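/* Check how an skb relates to a SACK block, splitting GSO skbs on an MSS
 * boundary when only part of the skb is covered.  Returns > 0 if the
 * (possibly trimmed) skb lies inside the block, 0 if not, < 0 on error.
 */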
1108static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1109 u32 start_seq, u32 end_seq)
1110{
1111 int err;
1112 bool in_sack;
1113 unsigned int pkt_len;
1114 unsigned int mss;
1115
1116 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1117 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1118
1119 if (tcp_skb_pcount(skb) > 1 && !in_sack &&
1120 after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
1121 mss = tcp_skb_mss(skb);
1122 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1123
1124 if (!in_sack) {
1125 pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
1126 if (pkt_len < mss)
1127 pkt_len = mss;
1128 } else {
1129 pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
1130 if (pkt_len < mss)
1131 return -EINVAL;
1132 }
1133
1134
1135
1136
1137 if (pkt_len > mss) {
1138 unsigned int new_len = (pkt_len / mss) * mss;
1139 if (!in_sack && new_len < pkt_len) {
1140 new_len += mss;
1141 if (new_len >= skb->len)
1142 return 0;
1143 }
1144 pkt_len = new_len;
1145 }
1146 err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
1147 if (err < 0)
1148 return err;
1149 }
1150
1151 return in_sack;
1152}
1153
1154
1155static u8 tcp_sacktag_one(struct sock *sk,
1156 struct tcp_sacktag_state *state, u8 sacked,
1157 u32 start_seq, u32 end_seq,
1158 int dup_sack, int pcount,
1159 const struct skb_mstamp *xmit_time)
1160{
1161 struct tcp_sock *tp = tcp_sk(sk);
1162 int fack_count = state->fack_count;
1163
1164
1165 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1166 if (tp->undo_marker && tp->undo_retrans > 0 &&
1167 after(end_seq, tp->undo_marker))
1168 tp->undo_retrans--;
1169 if (sacked & TCPCB_SACKED_ACKED)
1170 state->reord = min(fack_count, state->reord);
1171 }
1172
1173
1174 if (!after(end_seq, tp->snd_una))
1175 return sacked;
1176
1177 if (!(sacked & TCPCB_SACKED_ACKED)) {
1178 tcp_rack_advance(tp, xmit_time, sacked);
1179
1180 if (sacked & TCPCB_SACKED_RETRANS) {
1181
1182
1183
1184
1185 if (sacked & TCPCB_LOST) {
1186 sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1187 tp->lost_out -= pcount;
1188 tp->retrans_out -= pcount;
1189 }
1190 } else {
1191 if (!(sacked & TCPCB_RETRANS)) {
1192
1193
1194
1195 if (before(start_seq,
1196 tcp_highest_sack_seq(tp)))
1197 state->reord = min(fack_count,
1198 state->reord);
1199 if (!after(end_seq, tp->high_seq))
1200 state->flag |= FLAG_ORIG_SACK_ACKED;
1201 if (state->first_sackt.v64 == 0)
1202 state->first_sackt = *xmit_time;
1203 state->last_sackt = *xmit_time;
1204 }
1205
1206 if (sacked & TCPCB_LOST) {
1207 sacked &= ~TCPCB_LOST;
1208 tp->lost_out -= pcount;
1209 }
1210 }
1211
1212 sacked |= TCPCB_SACKED_ACKED;
1213 state->flag |= FLAG_DATA_SACKED;
1214 tp->sacked_out += pcount;
1215 tp->delivered += pcount;
1216
1217 fack_count += pcount;
1218
1219
1220 if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
1221 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
1222 tp->lost_cnt_hint += pcount;
1223
1224 if (fack_count > tp->fackets_out)
1225 tp->fackets_out = fack_count;
1226 }
1227
1228
1229
1230
1231
1232 if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
1233 sacked &= ~TCPCB_SACKED_RETRANS;
1234 tp->retrans_out -= pcount;
1235 }
1236
1237 return sacked;
1238}
1239
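/* Shift newly-SACKed bytes from this skb into the immediately preceding
 * already-SACKed skb, collapsing the two when the source becomes empty.
 */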
1243static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1244 struct tcp_sacktag_state *state,
1245 unsigned int pcount, int shifted, int mss,
1246 bool dup_sack)
1247{
1248 struct tcp_sock *tp = tcp_sk(sk);
1249 struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
1250 u32 start_seq = TCP_SKB_CB(skb)->seq;
1251 u32 end_seq = start_seq + shifted;
1252
1253 BUG_ON(!pcount);
1254
1255
1256
1257
1258
1259
1260
1261 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1262 start_seq, end_seq, dup_sack, pcount,
1263 &skb->skb_mstamp);
1264
1265 if (skb == tp->lost_skb_hint)
1266 tp->lost_cnt_hint += pcount;
1267
1268 TCP_SKB_CB(prev)->end_seq += shifted;
1269 TCP_SKB_CB(skb)->seq += shifted;
1270
1271 tcp_skb_pcount_add(prev, pcount);
1272 BUG_ON(tcp_skb_pcount(skb) < pcount);
1273 tcp_skb_pcount_add(skb, -pcount);
1274
1275
1276
1277
1278
1279
1280 if (!TCP_SKB_CB(prev)->tcp_gso_size)
1281 TCP_SKB_CB(prev)->tcp_gso_size = mss;
1282
1283
1284 if (tcp_skb_pcount(skb) <= 1)
1285 TCP_SKB_CB(skb)->tcp_gso_size = 0;
1286
1287
1288 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1289
1290 if (skb->len > 0) {
1291 BUG_ON(!tcp_skb_pcount(skb));
1292 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1293 return false;
1294 }
1295
1296
1297
1298 if (skb == tp->retransmit_skb_hint)
1299 tp->retransmit_skb_hint = prev;
1300 if (skb == tp->lost_skb_hint) {
1301 tp->lost_skb_hint = prev;
1302 tp->lost_cnt_hint -= tcp_skb_pcount(prev);
1303 }
1304
1305 TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1306 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1307 TCP_SKB_CB(prev)->end_seq++;
1308
1309 if (skb == tcp_highest_sack(sk))
1310 tcp_advance_highest_sack(sk, skb);
1311
1312 tcp_skb_collapse_tstamp(prev, skb);
1313 tcp_unlink_write_queue(skb, sk);
1314 sk_wmem_free_skb(sk, skb);
1315
1316 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
1317
1318 return true;
1319}
1320
1321
1322
1323
1324static int tcp_skb_seglen(const struct sk_buff *skb)
1325{
1326 return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
1327}
1328
1329
1330static int skb_can_shift(const struct sk_buff *skb)
1331{
1332 return !skb_headlen(skb) && skb_is_nonlinear(skb);
1333}
1334
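/* Try to coalesce a newly-SACKed range with the previous skb via
 * skb_shift(), so that SACK ranges spanning several skbs collapse into
 * fewer, larger skbs on the write queue.
 */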
1338static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1339 struct tcp_sacktag_state *state,
1340 u32 start_seq, u32 end_seq,
1341 bool dup_sack)
1342{
1343 struct tcp_sock *tp = tcp_sk(sk);
1344 struct sk_buff *prev;
1345 int mss;
1346 int pcount = 0;
1347 int len;
1348 int in_sack;
1349
1350 if (!sk_can_gso(sk))
1351 goto fallback;
1352
1353
1354 if (!dup_sack &&
1355 (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
1356 goto fallback;
1357 if (!skb_can_shift(skb))
1358 goto fallback;
1359
1360 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1361 goto fallback;
1362
1363
1364 if (unlikely(skb == tcp_write_queue_head(sk)))
1365 goto fallback;
1366 prev = tcp_write_queue_prev(sk, skb);
1367
1368 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1369 goto fallback;
1370
1371 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1372 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1373
1374 if (in_sack) {
1375 len = skb->len;
1376 pcount = tcp_skb_pcount(skb);
1377 mss = tcp_skb_seglen(skb);
1378
1379
1380
1381
1382 if (mss != tcp_skb_seglen(prev))
1383 goto fallback;
1384 } else {
1385 if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1386 goto noop;
1387
1388
1389
1390
1391 if (tcp_skb_pcount(skb) <= 1)
1392 goto noop;
1393
1394 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1395 if (!in_sack) {
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407 goto fallback;
1408 }
1409
1410 len = end_seq - TCP_SKB_CB(skb)->seq;
1411 BUG_ON(len < 0);
1412 BUG_ON(len > skb->len);
1413
1414
1415
1416
1417
1418 mss = tcp_skb_mss(skb);
1419
1420
1421
1422
1423 if (mss != tcp_skb_seglen(prev))
1424 goto fallback;
1425
1426 if (len == mss) {
1427 pcount = 1;
1428 } else if (len < mss) {
1429 goto noop;
1430 } else {
1431 pcount = len / mss;
1432 len = pcount * mss;
1433 }
1434 }
1435
1436
1437 if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
1438 goto fallback;
1439
1440 if (!skb_shift(prev, skb, len))
1441 goto fallback;
1442 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
1443 goto out;
1444
1445
1446
1447
1448 if (prev == tcp_write_queue_tail(sk))
1449 goto out;
1450 skb = tcp_write_queue_next(sk, prev);
1451
1452 if (!skb_can_shift(skb) ||
1453 (skb == tcp_send_head(sk)) ||
1454 ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
1455 (mss != tcp_skb_seglen(skb)))
1456 goto out;
1457
1458 len = skb->len;
1459 if (skb_shift(prev, skb, len)) {
1460 pcount += tcp_skb_pcount(skb);
1461 tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
1462 }
1463
1464out:
1465 state->fack_count += pcount;
1466 return prev;
1467
1468noop:
1469 return skb;
1470
1471fallback:
1472 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
1473 return NULL;
1474}
1475
1476static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1477 struct tcp_sack_block *next_dup,
1478 struct tcp_sacktag_state *state,
1479 u32 start_seq, u32 end_seq,
1480 bool dup_sack_in)
1481{
1482 struct tcp_sock *tp = tcp_sk(sk);
1483 struct sk_buff *tmp;
1484
1485 tcp_for_write_queue_from(skb, sk) {
1486 int in_sack = 0;
1487 bool dup_sack = dup_sack_in;
1488
1489 if (skb == tcp_send_head(sk))
1490 break;
1491
1492
1493 if (!before(TCP_SKB_CB(skb)->seq, end_seq))
1494 break;
1495
1496 if (next_dup &&
1497 before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
1498 in_sack = tcp_match_skb_to_sack(sk, skb,
1499 next_dup->start_seq,
1500 next_dup->end_seq);
1501 if (in_sack > 0)
1502 dup_sack = true;
1503 }
1504
1505
1506
1507
1508
1509 if (in_sack <= 0) {
1510 tmp = tcp_shift_skb_data(sk, skb, state,
1511 start_seq, end_seq, dup_sack);
1512 if (tmp) {
1513 if (tmp != skb) {
1514 skb = tmp;
1515 continue;
1516 }
1517
1518 in_sack = 0;
1519 } else {
1520 in_sack = tcp_match_skb_to_sack(sk, skb,
1521 start_seq,
1522 end_seq);
1523 }
1524 }
1525
1526 if (unlikely(in_sack < 0))
1527 break;
1528
1529 if (in_sack) {
1530 TCP_SKB_CB(skb)->sacked =
1531 tcp_sacktag_one(sk,
1532 state,
1533 TCP_SKB_CB(skb)->sacked,
1534 TCP_SKB_CB(skb)->seq,
1535 TCP_SKB_CB(skb)->end_seq,
1536 dup_sack,
1537 tcp_skb_pcount(skb),
1538 &skb->skb_mstamp);
1539
1540 if (!before(TCP_SKB_CB(skb)->seq,
1541 tcp_highest_sack_seq(tp)))
1542 tcp_advance_highest_sack(sk, skb);
1543 }
1544
1545 state->fack_count += tcp_skb_pcount(skb);
1546 }
1547 return skb;
1548}
1549
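/* Skip forward over skbs that end at or below skip_to_seq, only updating
 * the fack count, to avoid the full sacktag work for them.
 */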
1553static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1554 struct tcp_sacktag_state *state,
1555 u32 skip_to_seq)
1556{
1557 tcp_for_write_queue_from(skb, sk) {
1558 if (skb == tcp_send_head(sk))
1559 break;
1560
1561 if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
1562 break;
1563
1564 state->fack_count += tcp_skb_pcount(skb);
1565 }
1566 return skb;
1567}
1568
1569static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
1570 struct sock *sk,
1571 struct tcp_sack_block *next_dup,
1572 struct tcp_sacktag_state *state,
1573 u32 skip_to_seq)
1574{
1575 if (!next_dup)
1576 return skb;
1577
1578 if (before(next_dup->start_seq, skip_to_seq)) {
1579 skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
1580 skb = tcp_sacktag_walk(skb, sk, NULL, state,
1581 next_dup->start_seq, next_dup->end_seq,
1582 1);
1583 }
1584
1585 return skb;
1586}
1587
1588static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
1589{
1590 return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1591}
1592
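/* Walk the retransmission queue and tag skbs covered by the SACK blocks
 * carried in this ACK, using the cached blocks from the previous ACK to
 * avoid re-walking ranges that were already processed.
 */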
1593static int
1594tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1595 u32 prior_snd_una, struct tcp_sacktag_state *state)
1596{
1597 struct tcp_sock *tp = tcp_sk(sk);
1598 const unsigned char *ptr = (skb_transport_header(ack_skb) +
1599 TCP_SKB_CB(ack_skb)->sacked);
1600 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
1601 struct tcp_sack_block sp[TCP_NUM_SACKS];
1602 struct tcp_sack_block *cache;
1603 struct sk_buff *skb;
1604 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
1605 int used_sacks;
1606 bool found_dup_sack = false;
1607 int i, j;
1608 int first_sack_index;
1609
1610 state->flag = 0;
1611 state->reord = tp->packets_out;
1612
1613 if (!tp->sacked_out) {
1614 if (WARN_ON(tp->fackets_out))
1615 tp->fackets_out = 0;
1616 tcp_highest_sack_reset(sk);
1617 }
1618
1619 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1620 num_sacks, prior_snd_una);
1621 if (found_dup_sack)
1622 state->flag |= FLAG_DSACKING_ACK;
1623
1624
1625
1626
1627
1628 if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
1629 return 0;
1630
1631 if (!tp->packets_out)
1632 goto out;
1633
1634 used_sacks = 0;
1635 first_sack_index = 0;
1636 for (i = 0; i < num_sacks; i++) {
1637 bool dup_sack = !i && found_dup_sack;
1638
1639 sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
1640 sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
1641
1642 if (!tcp_is_sackblock_valid(tp, dup_sack,
1643 sp[used_sacks].start_seq,
1644 sp[used_sacks].end_seq)) {
1645 int mib_idx;
1646
1647 if (dup_sack) {
1648 if (!tp->undo_marker)
1649 mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
1650 else
1651 mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
1652 } else {
1653
1654 if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
1655 !after(sp[used_sacks].end_seq, tp->snd_una))
1656 continue;
1657 mib_idx = LINUX_MIB_TCPSACKDISCARD;
1658 }
1659
1660 NET_INC_STATS_BH(sock_net(sk), mib_idx);
1661 if (i == 0)
1662 first_sack_index = -1;
1663 continue;
1664 }
1665
1666
1667 if (!after(sp[used_sacks].end_seq, prior_snd_una))
1668 continue;
1669
1670 used_sacks++;
1671 }
1672
1673
1674 for (i = used_sacks - 1; i > 0; i--) {
1675 for (j = 0; j < i; j++) {
1676 if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
1677 swap(sp[j], sp[j + 1]);
1678
1679
1680 if (j == first_sack_index)
1681 first_sack_index = j + 1;
1682 }
1683 }
1684 }
1685
1686 skb = tcp_write_queue_head(sk);
1687 state->fack_count = 0;
1688 i = 0;
1689
1690 if (!tp->sacked_out) {
1691
1692 cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1693 } else {
1694 cache = tp->recv_sack_cache;
1695
1696 while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
1697 !cache->end_seq)
1698 cache++;
1699 }
1700
1701 while (i < used_sacks) {
1702 u32 start_seq = sp[i].start_seq;
1703 u32 end_seq = sp[i].end_seq;
1704 bool dup_sack = (found_dup_sack && (i == first_sack_index));
1705 struct tcp_sack_block *next_dup = NULL;
1706
1707 if (found_dup_sack && ((i + 1) == first_sack_index))
1708 next_dup = &sp[i + 1];
1709
1710
1711 while (tcp_sack_cache_ok(tp, cache) &&
1712 !before(start_seq, cache->end_seq))
1713 cache++;
1714
1715
1716 if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
1717 after(end_seq, cache->start_seq)) {
1718
1719
1720 if (before(start_seq, cache->start_seq)) {
1721 skb = tcp_sacktag_skip(skb, sk, state,
1722 start_seq);
1723 skb = tcp_sacktag_walk(skb, sk, next_dup,
1724 state,
1725 start_seq,
1726 cache->start_seq,
1727 dup_sack);
1728 }
1729
1730
1731 if (!after(end_seq, cache->end_seq))
1732 goto advance_sp;
1733
1734 skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
1735 state,
1736 cache->end_seq);
1737
1738
1739 if (tcp_highest_sack_seq(tp) == cache->end_seq) {
1740
1741 skb = tcp_highest_sack(sk);
1742 if (!skb)
1743 break;
1744 state->fack_count = tp->fackets_out;
1745 cache++;
1746 goto walk;
1747 }
1748
1749 skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
1750
1751 cache++;
1752 continue;
1753 }
1754
1755 if (!before(start_seq, tcp_highest_sack_seq(tp))) {
1756 skb = tcp_highest_sack(sk);
1757 if (!skb)
1758 break;
1759 state->fack_count = tp->fackets_out;
1760 }
1761 skb = tcp_sacktag_skip(skb, sk, state, start_seq);
1762
1763walk:
1764 skb = tcp_sacktag_walk(skb, sk, next_dup, state,
1765 start_seq, end_seq, dup_sack);
1766
1767advance_sp:
1768 i++;
1769 }
1770
1771
1772 for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
1773 tp->recv_sack_cache[i].start_seq = 0;
1774 tp->recv_sack_cache[i].end_seq = 0;
1775 }
1776 for (j = 0; j < used_sacks; j++)
1777 tp->recv_sack_cache[i++] = sp[j];
1778
1779 if ((state->reord < tp->fackets_out) &&
1780 ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
1781 tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
1782
1783 tcp_verify_left_out(tp);
1784out:
1785
1786#if FASTRETRANS_DEBUG > 0
1787 WARN_ON((int)tp->sacked_out < 0);
1788 WARN_ON((int)tp->lost_out < 0);
1789 WARN_ON((int)tp->retrans_out < 0);
1790 WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1791#endif
1792 return state->flag;
1793}
1794
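/* Limit sacked_out so that sacked_out + lost_out never exceeds packets_out.
 * Returns true if sacked_out had to be adjusted.
 */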
1798static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
1799{
1800 u32 holes;
1801
1802 holes = max(tp->lost_out, 1U);
1803 holes = min(holes, tp->packets_out);
1804
1805 if ((tp->sacked_out + holes) > tp->packets_out) {
1806 tp->sacked_out = tp->packets_out - holes;
1807 return true;
1808 }
1809 return false;
1810}
1811
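/* If more duplicate ACKs arrive than could be explained without reordering,
 * treat this as reordering in the (non-SACK) Reno emulation.
 */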
1816static void tcp_check_reno_reordering(struct sock *sk, const int addend)
1817{
1818 struct tcp_sock *tp = tcp_sk(sk);
1819 if (tcp_limit_reno_sacked(tp))
1820 tcp_update_reordering(sk, tp->packets_out + addend, 0);
1821}
1822
1823
1824
1825static void tcp_add_reno_sack(struct sock *sk)
1826{
1827 struct tcp_sock *tp = tcp_sk(sk);
1828 u32 prior_sacked = tp->sacked_out;
1829
1830 tp->sacked_out++;
1831 tcp_check_reno_reordering(sk, 0);
1832 if (tp->sacked_out > prior_sacked)
1833 tp->delivered++;
1834 tcp_verify_left_out(tp);
1835}
1836
1837
1838
1839static void tcp_remove_reno_sacks(struct sock *sk, int acked)
1840{
1841 struct tcp_sock *tp = tcp_sk(sk);
1842
1843 if (acked > 0) {
1844
1845 tp->delivered += max_t(int, acked - tp->sacked_out, 1);
1846 if (acked - 1 >= tp->sacked_out)
1847 tp->sacked_out = 0;
1848 else
1849 tp->sacked_out -= acked - 1;
1850 }
1851 tcp_check_reno_reordering(sk, acked);
1852 tcp_verify_left_out(tp);
1853}
1854
1855static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1856{
1857 tp->sacked_out = 0;
1858}
1859
1860void tcp_clear_retrans(struct tcp_sock *tp)
1861{
1862 tp->retrans_out = 0;
1863 tp->lost_out = 0;
1864 tp->undo_marker = 0;
1865 tp->undo_retrans = -1;
1866 tp->fackets_out = 0;
1867 tp->sacked_out = 0;
1868}
1869
1870static inline void tcp_init_undo(struct tcp_sock *tp)
1871{
1872 tp->undo_marker = tp->snd_una;
1873
1874 tp->undo_retrans = tp->retrans_out ? : -1;
1875}
1876
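/* Enter Loss state (RTO).  If the receiver appears to have reneged on
 * previously SACKed data, forget all SACK information and re-tag the whole
 * queue as lost; otherwise preserve the SACK scoreboard.
 */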
1881void tcp_enter_loss(struct sock *sk)
1882{
1883 const struct inet_connection_sock *icsk = inet_csk(sk);
1884 struct tcp_sock *tp = tcp_sk(sk);
1885 struct net *net = sock_net(sk);
1886 struct sk_buff *skb;
1887 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
1888 bool is_reneg;
1889
1890
1891 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
1892 !after(tp->high_seq, tp->snd_una) ||
1893 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1894 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1895 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1896 tcp_ca_event(sk, CA_EVENT_LOSS);
1897 tcp_init_undo(tp);
1898 }
1899 tp->snd_cwnd = 1;
1900 tp->snd_cwnd_cnt = 0;
1901 tp->snd_cwnd_stamp = tcp_time_stamp;
1902
1903 tp->retrans_out = 0;
1904 tp->lost_out = 0;
1905
1906 if (tcp_is_reno(tp))
1907 tcp_reset_reno_sack(tp);
1908
1909 skb = tcp_write_queue_head(sk);
1910 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
1911 if (is_reneg) {
1912 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1913 tp->sacked_out = 0;
1914 tp->fackets_out = 0;
1915 }
1916 tcp_clear_all_retrans_hints(tp);
1917
1918 tcp_for_write_queue(skb, sk) {
1919 if (skb == tcp_send_head(sk))
1920 break;
1921
1922 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
1923 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
1924 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1925 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1926 tp->lost_out += tcp_skb_pcount(skb);
1927 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1928 }
1929 }
1930 tcp_verify_left_out(tp);
1931
1932
1933
1934
1935 if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
1936 tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
1937 tp->reordering = min_t(unsigned int, tp->reordering,
1938 net->ipv4.sysctl_tcp_reordering);
1939 tcp_set_ca_state(sk, TCP_CA_Loss);
1940 tp->high_seq = tp->snd_nxt;
1941 tcp_ecn_queue_cwr(tp);
1942
1943
1944
1945
1946
1947 tp->frto = sysctl_tcp_frto &&
1948 (new_recovery || icsk->icsk_retransmits) &&
1949 !inet_csk(sk)->icsk_mtup.probe_size;
1950}
1951
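/* If an ACK points back into a range we remember as SACKed, the receiver
 * has discarded data it previously SACKed ("reneging").  React as on an
 * RTO by scheduling a head retransmit after a short, RTT-based delay.
 */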
1962static bool tcp_check_sack_reneging(struct sock *sk, int flag)
1963{
1964 if (flag & FLAG_SACK_RENEGING) {
1965 struct tcp_sock *tp = tcp_sk(sk);
1966 unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
1967 msecs_to_jiffies(10));
1968
1969 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1970 delay, TCP_RTO_MAX);
1971 return true;
1972 }
1973 return false;
1974}
1975
1976static inline int tcp_fackets_out(const struct tcp_sock *tp)
1977{
1978 return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
1979}
1980
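/* Heuristic count of duplicate ACKs.  With FACK this is fackets_out (SACKed
 * segments plus the holes up to the highest SACK); otherwise it is
 * sacked_out plus one for the segment the dupacks point at.
 */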
1996static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
1997{
1998 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
1999}
2000
2001static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2002{
2003 struct tcp_sock *tp = tcp_sk(sk);
2004 unsigned long delay;
2005
2006
2007
2008
2009
2010 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
2011 (flag & FLAG_ECE) || !tp->srtt_us)
2012 return false;
2013
2014 delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
2015 msecs_to_jiffies(2));
2016
2017 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2018 return false;
2019
2020 inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
2021 TCP_RTO_MAX);
2022 return true;
2023}
2024
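/* Decide whether it is time to move into fast-retransmit/recovery.
 *
 * This is the heart of the Linux NewReno/SACK/FACK/ECN state machine: the
 * sender sits in one of Open, Disorder, CWR, Recovery or Loss, and leaves
 * Open/Disorder for Recovery once segments are marked lost or the
 * dupACK/reordering threshold is crossed, or one of the early-retransmit /
 * thin-stream heuristics below fires.
 */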
2118static bool tcp_time_to_recover(struct sock *sk, int flag)
2119{
2120 struct tcp_sock *tp = tcp_sk(sk);
2121 __u32 packets_out;
2122 int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
2123
2124
2125 if (tp->lost_out)
2126 return true;
2127
2128
2129 if (tcp_dupack_heuristics(tp) > tp->reordering)
2130 return true;
2131
2132
2133
2134
2135 packets_out = tp->packets_out;
2136 if (packets_out <= tp->reordering &&
2137 tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
2138 !tcp_may_send_now(sk)) {
2139
2140
2141
2142 return true;
2143 }
2144
2145
2146
2147
2148
2149
2150 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2151 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2152 tcp_is_sack(tp) && !tcp_send_head(sk))
2153 return true;
2154
2155
2156
2157
2158
2159
2160 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2161 (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
2162 !tcp_may_send_now(sk))
2163 return !tcp_pause_early_retransmit(sk, flag);
2164
2165 return false;
2166}
2167
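/* Mark the first "packets" segments (counted from the head of the write
 * queue, or from the cached lost_skb_hint) as lost, fragmenting the last
 * skb if the boundary falls inside it.
 */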
2174static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2175{
2176 struct tcp_sock *tp = tcp_sk(sk);
2177 struct sk_buff *skb;
2178 int cnt, oldcnt, lost;
2179 unsigned int mss;
2180
2181 const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2182
2183 WARN_ON(packets > tp->packets_out);
2184 if (tp->lost_skb_hint) {
2185 skb = tp->lost_skb_hint;
2186 cnt = tp->lost_cnt_hint;
2187
2188 if (mark_head && skb != tcp_write_queue_head(sk))
2189 return;
2190 } else {
2191 skb = tcp_write_queue_head(sk);
2192 cnt = 0;
2193 }
2194
2195 tcp_for_write_queue_from(skb, sk) {
2196 if (skb == tcp_send_head(sk))
2197 break;
2198
2199
2200 tp->lost_skb_hint = skb;
2201 tp->lost_cnt_hint = cnt;
2202
2203 if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
2204 break;
2205
2206 oldcnt = cnt;
2207 if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
2208 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2209 cnt += tcp_skb_pcount(skb);
2210
2211 if (cnt > packets) {
2212 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2213 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2214 (oldcnt >= packets))
2215 break;
2216
2217 mss = tcp_skb_mss(skb);
2218
2219 lost = (packets - oldcnt) * mss;
2220 if (lost < skb->len &&
2221 tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
2222 break;
2223 cnt = packets;
2224 }
2225
2226 tcp_skb_mark_lost(tp, skb);
2227
2228 if (mark_head)
2229 break;
2230 }
2231 tcp_verify_left_out(tp);
2232}
2233
2234
2235
2236static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2237{
2238 struct tcp_sock *tp = tcp_sk(sk);
2239
2240 if (tcp_is_reno(tp)) {
2241 tcp_mark_head_lost(sk, 1, 1);
2242 } else if (tcp_is_fack(tp)) {
2243 int lost = tp->fackets_out - tp->reordering;
2244 if (lost <= 0)
2245 lost = 1;
2246 tcp_mark_head_lost(sk, lost, 0);
2247 } else {
2248 int sacked_upto = tp->sacked_out - tp->reordering;
2249 if (sacked_upto >= 0)
2250 tcp_mark_head_lost(sk, sacked_upto, 0);
2251 else if (fast_rexmit)
2252 tcp_mark_head_lost(sk, 1, 1);
2253 }
2254}
2255
2256
2257
2258
2259static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
2260{
2261 tp->snd_cwnd = min(tp->snd_cwnd,
2262 tcp_packets_in_flight(tp) + tcp_max_burst(tp));
2263 tp->snd_cwnd_stamp = tcp_time_stamp;
2264}
2265
2266static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
2267{
2268 return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2269 before(tp->rx_opt.rcv_tsecr, when);
2270}
2271
2272
2273
2274
2275static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
2276 const struct sk_buff *skb)
2277{
2278 return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
2279 tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
2280}
2281
2282
2283
2284
2285static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
2286{
2287 return !tp->retrans_stamp ||
2288 tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
2289}
2290
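/* Undo procedures.
 *
 * The congestion window reduction is "undone" when D-SACKs or the timestamp
 * test above show that the retransmissions which triggered it were spurious.
 * tcp_any_retrans_done() reports whether any retransmission is still
 * outstanding or remembered on the head skb.
 */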
2307static bool tcp_any_retrans_done(const struct sock *sk)
2308{
2309 const struct tcp_sock *tp = tcp_sk(sk);
2310 struct sk_buff *skb;
2311
2312 if (tp->retrans_out)
2313 return true;
2314
2315 skb = tcp_write_queue_head(sk);
2316 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2317 return true;
2318
2319 return false;
2320}
2321
2322#if FASTRETRANS_DEBUG > 1
2323static void DBGUNDO(struct sock *sk, const char *msg)
2324{
2325 struct tcp_sock *tp = tcp_sk(sk);
2326 struct inet_sock *inet = inet_sk(sk);
2327
2328 if (sk->sk_family == AF_INET) {
2329 pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2330 msg,
2331 &inet->inet_daddr, ntohs(inet->inet_dport),
2332 tp->snd_cwnd, tcp_left_out(tp),
2333 tp->snd_ssthresh, tp->prior_ssthresh,
2334 tp->packets_out);
2335 }
2336#if IS_ENABLED(CONFIG_IPV6)
2337 else if (sk->sk_family == AF_INET6) {
2338 struct ipv6_pinfo *np = inet6_sk(sk);
2339 pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2340 msg,
2341 &np->daddr, ntohs(inet->inet_dport),
2342 tp->snd_cwnd, tcp_left_out(tp),
2343 tp->snd_ssthresh, tp->prior_ssthresh,
2344 tp->packets_out);
2345 }
2346#endif
2347}
2348#else
2349#define DBGUNDO(x...) do { } while (0)
2350#endif
2351
2352static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2353{
2354 struct tcp_sock *tp = tcp_sk(sk);
2355
2356 if (unmark_loss) {
2357 struct sk_buff *skb;
2358
2359 tcp_for_write_queue(skb, sk) {
2360 if (skb == tcp_send_head(sk))
2361 break;
2362 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
2363 }
2364 tp->lost_out = 0;
2365 tcp_clear_all_retrans_hints(tp);
2366 }
2367
2368 if (tp->prior_ssthresh) {
2369 const struct inet_connection_sock *icsk = inet_csk(sk);
2370
2371 if (icsk->icsk_ca_ops->undo_cwnd)
2372 tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
2373 else
2374 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
2375
2376 if (tp->prior_ssthresh > tp->snd_ssthresh) {
2377 tp->snd_ssthresh = tp->prior_ssthresh;
2378 tcp_ecn_withdraw_cwr(tp);
2379 }
2380 }
2381 tp->snd_cwnd_stamp = tcp_time_stamp;
2382 tp->undo_marker = 0;
2383}
2384
2385static inline bool tcp_may_undo(const struct tcp_sock *tp)
2386{
2387 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
2388}
2389
2390
2391static bool tcp_try_undo_recovery(struct sock *sk)
2392{
2393 struct tcp_sock *tp = tcp_sk(sk);
2394
2395 if (tcp_may_undo(tp)) {
2396 int mib_idx;
2397
2398
2399
2400
2401 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2402 tcp_undo_cwnd_reduction(sk, false);
2403 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2404 mib_idx = LINUX_MIB_TCPLOSSUNDO;
2405 else
2406 mib_idx = LINUX_MIB_TCPFULLUNDO;
2407
2408 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2409 }
2410 if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2411
2412
2413
2414 tcp_moderate_cwnd(tp);
2415 if (!tcp_any_retrans_done(sk))
2416 tp->retrans_stamp = 0;
2417 return true;
2418 }
2419 tcp_set_ca_state(sk, TCP_CA_Open);
2420 return false;
2421}
2422
2423
2424static bool tcp_try_undo_dsack(struct sock *sk)
2425{
2426 struct tcp_sock *tp = tcp_sk(sk);
2427
2428 if (tp->undo_marker && !tp->undo_retrans) {
2429 DBGUNDO(sk, "D-SACK");
2430 tcp_undo_cwnd_reduction(sk, false);
2431 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2432 return true;
2433 }
2434 return false;
2435}
2436
2437
2438static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2439{
2440 struct tcp_sock *tp = tcp_sk(sk);
2441
2442 if (frto_undo || tcp_may_undo(tp)) {
2443 tcp_undo_cwnd_reduction(sk, true);
2444
2445 DBGUNDO(sk, "partial loss");
2446 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2447 if (frto_undo)
2448 NET_INC_STATS_BH(sock_net(sk),
2449 LINUX_MIB_TCPSPURIOUSRTOS);
2450 inet_csk(sk)->icsk_retransmits = 0;
2451 if (frto_undo || tcp_is_sack(tp))
2452 tcp_set_ca_state(sk, TCP_CA_Open);
2453 return true;
2454 }
2455 return false;
2456}
2457
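/* The cwnd reduction in CWR and Recovery uses the PRR algorithm
 * (Proportional Rate Reduction, RFC 6937): packets are clocked out in
 * proportion to the delivery rate so that cwnd converges to ssthresh by the
 * end of recovery without a sudden collapse.
 */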
2467static void tcp_init_cwnd_reduction(struct sock *sk)
2468{
2469 struct tcp_sock *tp = tcp_sk(sk);
2470
2471 tp->high_seq = tp->snd_nxt;
2472 tp->tlp_high_seq = 0;
2473 tp->snd_cwnd_cnt = 0;
2474 tp->prior_cwnd = tp->snd_cwnd;
2475 tp->prr_delivered = 0;
2476 tp->prr_out = 0;
2477 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2478 tcp_ecn_queue_cwr(tp);
2479}
2480
2481static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
2482 int flag)
2483{
2484 struct tcp_sock *tp = tcp_sk(sk);
2485 int sndcnt = 0;
2486 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2487
2488 if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
2489 return;
2490
2491 tp->prr_delivered += newly_acked_sacked;
2492 if (delta < 0) {
2493 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2494 tp->prior_cwnd - 1;
2495 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2496 } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
2497 !(flag & FLAG_LOST_RETRANS)) {
2498 sndcnt = min_t(int, delta,
2499 max_t(int, tp->prr_delivered - tp->prr_out,
2500 newly_acked_sacked) + 1);
2501 } else {
2502 sndcnt = min(delta, newly_acked_sacked);
2503 }
2504
2505 sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
2506 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2507}
2508
2509static inline void tcp_end_cwnd_reduction(struct sock *sk)
2510{
2511 struct tcp_sock *tp = tcp_sk(sk);
2512
2513
2514 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
2515 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
2516 tp->snd_cwnd = tp->snd_ssthresh;
2517 tp->snd_cwnd_stamp = tcp_time_stamp;
2518 }
2519 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2520}
2521
2522
2523void tcp_enter_cwr(struct sock *sk)
2524{
2525 struct tcp_sock *tp = tcp_sk(sk);
2526
2527 tp->prior_ssthresh = 0;
2528 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2529 tp->undo_marker = 0;
2530 tcp_init_cwnd_reduction(sk);
2531 tcp_set_ca_state(sk, TCP_CA_CWR);
2532 }
2533}
2534EXPORT_SYMBOL(tcp_enter_cwr);
2535
2536static void tcp_try_keep_open(struct sock *sk)
2537{
2538 struct tcp_sock *tp = tcp_sk(sk);
2539 int state = TCP_CA_Open;
2540
2541 if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
2542 state = TCP_CA_Disorder;
2543
2544 if (inet_csk(sk)->icsk_ca_state != state) {
2545 tcp_set_ca_state(sk, state);
2546 tp->high_seq = tp->snd_nxt;
2547 }
2548}
2549
2550static void tcp_try_to_open(struct sock *sk, int flag)
2551{
2552 struct tcp_sock *tp = tcp_sk(sk);
2553
2554 tcp_verify_left_out(tp);
2555
2556 if (!tcp_any_retrans_done(sk))
2557 tp->retrans_stamp = 0;
2558
2559 if (flag & FLAG_ECE)
2560 tcp_enter_cwr(sk);
2561
2562 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2563 tcp_try_keep_open(sk);
2564 }
2565}
2566
2567static void tcp_mtup_probe_failed(struct sock *sk)
2568{
2569 struct inet_connection_sock *icsk = inet_csk(sk);
2570
2571 icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
2572 icsk->icsk_mtup.probe_size = 0;
2573 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
2574}
2575
2576static void tcp_mtup_probe_success(struct sock *sk)
2577{
2578 struct tcp_sock *tp = tcp_sk(sk);
2579 struct inet_connection_sock *icsk = inet_csk(sk);
2580
2581
2582 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2583 tp->snd_cwnd = tp->snd_cwnd *
2584 tcp_mss_to_mtu(sk, tp->mss_cache) /
2585 icsk->icsk_mtup.probe_size;
2586 tp->snd_cwnd_cnt = 0;
2587 tp->snd_cwnd_stamp = tcp_time_stamp;
2588 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2589
2590 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2591 icsk->icsk_mtup.probe_size = 0;
2592 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
2593 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
2594}
2595
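/* Do a simple retransmit without the tcp_timer backoff machinery; used when
 * path MTU discovery shrinks the MSS.  The socket is already locked here.
 */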
2600void tcp_simple_retransmit(struct sock *sk)
2601{
2602 const struct inet_connection_sock *icsk = inet_csk(sk);
2603 struct tcp_sock *tp = tcp_sk(sk);
2604 struct sk_buff *skb;
2605 unsigned int mss = tcp_current_mss(sk);
2606 u32 prior_lost = tp->lost_out;
2607
2608 tcp_for_write_queue(skb, sk) {
2609 if (skb == tcp_send_head(sk))
2610 break;
2611 if (tcp_skb_seglen(skb) > mss &&
2612 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2613 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2614 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
2615 tp->retrans_out -= tcp_skb_pcount(skb);
2616 }
2617 tcp_skb_mark_lost_uncond_verify(tp, skb);
2618 }
2619 }
2620
2621 tcp_clear_retrans_hints_partial(tp);
2622
2623 if (prior_lost == tp->lost_out)
2624 return;
2625
2626 if (tcp_is_reno(tp))
2627 tcp_limit_reno_sacked(tp);
2628
2629 tcp_verify_left_out(tp);
2630
2631
2632
2633
2634
2635
2636 if (icsk->icsk_ca_state != TCP_CA_Loss) {
2637 tp->high_seq = tp->snd_nxt;
2638 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2639 tp->prior_ssthresh = 0;
2640 tp->undo_marker = 0;
2641 tcp_set_ca_state(sk, TCP_CA_Loss);
2642 }
2643 tcp_xmit_retransmit_queue(sk);
2644}
2645EXPORT_SYMBOL(tcp_simple_retransmit);
2646
2647static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2648{
2649 struct tcp_sock *tp = tcp_sk(sk);
2650 int mib_idx;
2651
2652 if (tcp_is_reno(tp))
2653 mib_idx = LINUX_MIB_TCPRENORECOVERY;
2654 else
2655 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2656
2657 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2658
2659 tp->prior_ssthresh = 0;
2660 tcp_init_undo(tp);
2661
2662 if (!tcp_in_cwnd_reduction(sk)) {
2663 if (!ece_ack)
2664 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2665 tcp_init_cwnd_reduction(sk);
2666 }
2667 tcp_set_ca_state(sk, TCP_CA_Recovery);
2668}
2669
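/* Process an ACK while in CA_Loss.  Try to undo spurious RTOs (F-RTO),
 * otherwise decide whether to retransmit lost data or send new data, and
 * return to Open once snd_una passes high_seq.
 */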
2673static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2674 int *rexmit)
2675{
2676 struct tcp_sock *tp = tcp_sk(sk);
2677 bool recovered = !before(tp->snd_una, tp->high_seq);
2678
2679 if ((flag & FLAG_SND_UNA_ADVANCED) &&
2680 tcp_try_undo_loss(sk, false))
2681 return;
2682
2683 if (tp->frto) {
2684
2685
2686
2687 if ((flag & FLAG_ORIG_SACK_ACKED) &&
2688 tcp_try_undo_loss(sk, true))
2689 return;
2690
2691 if (after(tp->snd_nxt, tp->high_seq)) {
2692 if (flag & FLAG_DATA_SACKED || is_dupack)
2693 tp->frto = 0;
2694 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2695 tp->high_seq = tp->snd_nxt;
2696
2697
2698
2699
2700 if (tcp_send_head(sk) &&
2701 after(tcp_wnd_end(tp), tp->snd_nxt)) {
2702 *rexmit = REXMIT_NEW;
2703 return;
2704 }
2705 tp->frto = 0;
2706 }
2707 }
2708
2709 if (recovered) {
2710
2711 tcp_try_undo_recovery(sk);
2712 return;
2713 }
2714 if (tcp_is_reno(tp)) {
2715
2716
2717
2718 if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2719 tcp_add_reno_sack(sk);
2720 else if (flag & FLAG_SND_UNA_ADVANCED)
2721 tcp_reset_reno_sack(tp);
2722 }
2723 *rexmit = REXMIT_LOST;
2724}
2725
2726
2727static bool tcp_try_undo_partial(struct sock *sk, const int acked)
2728{
2729 struct tcp_sock *tp = tcp_sk(sk);
2730
2731 if (tp->undo_marker && tcp_packet_delayed(tp)) {
2732
2733
2734
2735 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
2736
2737
2738
2739
2740
2741
2742 if (tp->retrans_out)
2743 return true;
2744
2745 if (!tcp_any_retrans_done(sk))
2746 tp->retrans_stamp = 0;
2747
2748 DBGUNDO(sk, "partial recovery");
2749 tcp_undo_cwnd_reduction(sk, true);
2750 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2751 tcp_try_keep_open(sk);
2752 return true;
2753 }
2754 return false;
2755}
2756
2757/* Process an event which can update packets-in-flight not trivially.
2758 * Main goal of this function is to calculate new estimate for left_out,
2759 * taking into account both packets sitting in receiver's buffer and
2760 * packets lost by network.
2761 *
2762 * Besides that it updates the congestion state when packet loss or ECN
2763 * is detected. But it does not reduce the cwnd, that is done by the
2764 * congestion control later.
2765 *
2766 * It does _not_ decide what to send, that is made in the function
2767 * tcp_xmit_retransmit_queue().
2768 */
2769static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2770 bool is_dupack, int *ack_flag, int *rexmit)
2771{
2772 struct inet_connection_sock *icsk = inet_csk(sk);
2773 struct tcp_sock *tp = tcp_sk(sk);
2774 int fast_rexmit = 0, flag = *ack_flag;
2775 bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2776 (tcp_fackets_out(tp) > tp->reordering));
2777
2778 if (WARN_ON(!tp->packets_out && tp->sacked_out))
2779 tp->sacked_out = 0;
2780 if (WARN_ON(!tp->sacked_out && tp->fackets_out))
2781 tp->fackets_out = 0;
2782
2783
2784
2785 if (flag & FLAG_ECE)
2786 tp->prior_ssthresh = 0;
2787
2788
2789 if (tcp_check_sack_reneging(sk, flag))
2790 return;
2791
2792
2793 tcp_verify_left_out(tp);
2794
2795
2796
2797 if (icsk->icsk_ca_state == TCP_CA_Open) {
2798 WARN_ON(tp->retrans_out != 0);
2799 tp->retrans_stamp = 0;
2800 } else if (!before(tp->snd_una, tp->high_seq)) {
2801 switch (icsk->icsk_ca_state) {
2802 case TCP_CA_CWR:
2803
2804
2805 if (tp->snd_una != tp->high_seq) {
2806 tcp_end_cwnd_reduction(sk);
2807 tcp_set_ca_state(sk, TCP_CA_Open);
2808 }
2809 break;
2810
2811 case TCP_CA_Recovery:
2812 if (tcp_is_reno(tp))
2813 tcp_reset_reno_sack(tp);
2814 if (tcp_try_undo_recovery(sk))
2815 return;
2816 tcp_end_cwnd_reduction(sk);
2817 break;
2818 }
2819 }
2820
2821
2822 if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
2823 tcp_rack_mark_lost(sk)) {
2824 flag |= FLAG_LOST_RETRANS;
2825 *ack_flag |= FLAG_LOST_RETRANS;
2826 }
2827
2828
2829 switch (icsk->icsk_ca_state) {
2830 case TCP_CA_Recovery:
2831 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
2832 if (tcp_is_reno(tp) && is_dupack)
2833 tcp_add_reno_sack(sk);
2834 } else {
2835 if (tcp_try_undo_partial(sk, acked))
2836 return;
2837
2838 do_lost = tcp_is_reno(tp) ||
2839 tcp_fackets_out(tp) > tp->reordering;
2840 }
2841 if (tcp_try_undo_dsack(sk)) {
2842 tcp_try_keep_open(sk);
2843 return;
2844 }
2845 break;
2846 case TCP_CA_Loss:
2847 tcp_process_loss(sk, flag, is_dupack, rexmit);
2848 if (icsk->icsk_ca_state != TCP_CA_Open &&
2849 !(flag & FLAG_LOST_RETRANS))
2850 return;
2851
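 /* fall through: change state if cwnd was undone or lost
 * retransmits were detected while in CA_Loss. */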
2852 default:
2853 if (tcp_is_reno(tp)) {
2854 if (flag & FLAG_SND_UNA_ADVANCED)
2855 tcp_reset_reno_sack(tp);
2856 if (is_dupack)
2857 tcp_add_reno_sack(sk);
2858 }
2859
2860 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
2861 tcp_try_undo_dsack(sk);
2862
2863 if (!tcp_time_to_recover(sk, flag)) {
2864 tcp_try_to_open(sk, flag);
2865 return;
2866 }
2867
2868
2869 if (icsk->icsk_ca_state < TCP_CA_CWR &&
2870 icsk->icsk_mtup.probe_size &&
2871 tp->snd_una == tp->mtu_probe.probe_seq_start) {
2872 tcp_mtup_probe_failed(sk);
2873
2874 tp->snd_cwnd++;
2875 tcp_simple_retransmit(sk);
2876 return;
2877 }
2878
2879
2880 tcp_enter_recovery(sk, (flag & FLAG_ECE));
2881 fast_rexmit = 1;
2882 }
2883
2884 if (do_lost)
2885 tcp_update_scoreboard(sk, fast_rexmit);
2886 *rexmit = REXMIT_LOST;
2887}
2888
2889/* Track the minimum RTT seen over a window of sysctl_tcp_min_rtt_wlen
2890 * seconds, using a windowed min filter (after Kathleen Nichols):
2891 * keep the best, second best and third best {rtt, timestamp} samples
2892 * and age them out as the window advances, so each update runs in
2893 * constant time and space while closely approximating the true
2894 * windowed minimum.
2895 */
2907static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2908{
2909 const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
2910 struct rtt_meas *m = tcp_sk(sk)->rtt_min;
2911 struct rtt_meas rttm = {
2912 .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
2913 .ts = now,
2914 };
2915 u32 elapsed;
2916
2917
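 /* Check if the new measurement updates the 1st, 2nd or 3rd choice min. */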
2918 if (unlikely(rttm.rtt <= m[0].rtt))
2919 m[0] = m[1] = m[2] = rttm;
2920 else if (rttm.rtt <= m[1].rtt)
2921 m[1] = m[2] = rttm;
2922 else if (rttm.rtt <= m[2].rtt)
2923 m[2] = rttm;
2924
2925 elapsed = now - m[0].ts;
2926 if (unlikely(elapsed > wlen)) {
2927
2928
2929
2930 m[0] = m[1];
2931 m[1] = m[2];
2932 m[2] = rttm;
2933 if (now - m[0].ts > wlen) {
2934 m[0] = m[1];
2935 m[1] = rttm;
2936 if (now - m[0].ts > wlen)
2937 m[0] = rttm;
2938 }
2939 } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
2940
2941
2942
2943 m[2] = m[1] = rttm;
2944 } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
2945
2946
2947
2948 m[2] = rttm;
2949 }
2950}
2951
2952static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2953 long seq_rtt_us, long sack_rtt_us,
2954 long ca_rtt_us)
2955{
2956 const struct tcp_sock *tp = tcp_sk(sk);
2957
2958 /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
2959 * broken middle-boxes or peers may corrupt TS-ECR fields. But
2960 * Karn's algorithm forbids taking RTT if some retransmitted data
2961 * is acked (RFC 6298).
2962 */
2963 if (seq_rtt_us < 0)
2964 seq_rtt_us = sack_rtt_us;
2965
2966
2967
2968
2969
2970
2971
2972 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2973 flag & FLAG_ACKED)
2974 seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
2975 tp->rx_opt.rcv_tsecr);
2976 if (seq_rtt_us < 0)
2977 return false;
2978
2979
2980
2981
2982
2983 tcp_update_rtt_min(sk, ca_rtt_us);
2984 tcp_rtt_estimator(sk, seq_rtt_us);
2985 tcp_set_rto(sk);
2986
2987
2988 inet_csk(sk)->icsk_backoff = 0;
2989 return true;
2990}
2991
2992
2993void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
2994{
2995 long rtt_us = -1L;
2996
2997 if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) {
2998 struct skb_mstamp now;
2999
3000 skb_mstamp_get(&now);
3001 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
3002 }
3003
3004 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
3005}
3006
3007
3008static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
3009{
3010 const struct inet_connection_sock *icsk = inet_csk(sk);
3011
3012 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
3013 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
3014}
3015
3016
3017
3018
3019void tcp_rearm_rto(struct sock *sk)
3020{
3021 const struct inet_connection_sock *icsk = inet_csk(sk);
3022 struct tcp_sock *tp = tcp_sk(sk);
3023
3024 /* If the retransmit timer is currently being used by Fast Open
3025 * for SYN-ACK retransmits, do not touch it.
3026 */
3027 if (tp->fastopen_rsk)
3028 return;
3029
3030 if (!tp->packets_out) {
3031 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3032 } else {
3033 u32 rto = inet_csk(sk)->icsk_rto;
3034
3035 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3036 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3037 struct sk_buff *skb = tcp_write_queue_head(sk);
3038 const u32 rto_time_stamp =
3039 tcp_skb_timestamp(skb) + rto;
3040 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
3041
3042
3043
3044 if (delta > 0)
3045 rto = delta;
3046 }
3047 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3048 TCP_RTO_MAX);
3049 }
3050}
3051
3052
3053
3054
3055void tcp_resume_early_retransmit(struct sock *sk)
3056{
3057 struct tcp_sock *tp = tcp_sk(sk);
3058
3059 tcp_rearm_rto(sk);
3060
3061
3062 if (!tp->do_early_retrans)
3063 return;
3064
3065 tcp_enter_recovery(sk, false);
3066 tcp_update_scoreboard(sk, 1);
3067 tcp_xmit_retransmit_queue(sk);
3068}
3069
3070
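/* If we get here, the whole TSO packet has not been acked: trim the
 * acked part off the head and report how many packets it covered.
 */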
3071static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3072{
3073 struct tcp_sock *tp = tcp_sk(sk);
3074 u32 packets_acked;
3075
3076 BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
3077
3078 packets_acked = tcp_skb_pcount(skb);
3079 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3080 return 0;
3081 packets_acked -= tcp_skb_pcount(skb);
3082
3083 if (packets_acked) {
3084 BUG_ON(tcp_skb_pcount(skb) == 0);
3085 BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
3086 }
3087
3088 return packets_acked;
3089}
3090
3091static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
3092 u32 prior_snd_una)
3093{
3094 const struct skb_shared_info *shinfo;
3095
3096
3097 if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
3098 return;
3099
3100 shinfo = skb_shinfo(skb);
3101 if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
3102 !before(shinfo->tskey, prior_snd_una) &&
3103 before(shinfo->tskey, tcp_sk(sk)->snd_una))
3104 __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
3105}
3106
3107/* Remove acknowledged frames from the retransmission queue. If our packet
3108 * is before the ack sequence we can discard it as it's confirmed to have
3109 * arrived at the other end.
3110 */
3111static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3112 u32 prior_snd_una, int *acked,
3113 struct tcp_sacktag_state *sack)
3114{
3115 const struct inet_connection_sock *icsk = inet_csk(sk);
3116 struct skb_mstamp first_ackt, last_ackt, now;
3117 struct tcp_sock *tp = tcp_sk(sk);
3118 u32 prior_sacked = tp->sacked_out;
3119 u32 reord = tp->packets_out;
3120 bool fully_acked = true;
3121 long sack_rtt_us = -1L;
3122 long seq_rtt_us = -1L;
3123 long ca_rtt_us = -1L;
3124 struct sk_buff *skb;
3125 u32 pkts_acked = 0;
3126 bool rtt_update;
3127 int flag = 0;
3128
3129 first_ackt.v64 = 0;
3130
3131 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
3132 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3133 u8 sacked = scb->sacked;
3134 u32 acked_pcount;
3135
3136 tcp_ack_tstamp(sk, skb, prior_snd_una);
3137
3138
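 /* Determine how many packets and what bytes were acked, tso and else */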
3139 if (after(scb->end_seq, tp->snd_una)) {
3140 if (tcp_skb_pcount(skb) == 1 ||
3141 !after(tp->snd_una, scb->seq))
3142 break;
3143
3144 acked_pcount = tcp_tso_acked(sk, skb);
3145 if (!acked_pcount)
3146 break;
3147
3148 fully_acked = false;
3149 } else {
3150
3151 prefetchw(skb->next);
3152 acked_pcount = tcp_skb_pcount(skb);
3153 }
3154
3155 if (unlikely(sacked & TCPCB_RETRANS)) {
3156 if (sacked & TCPCB_SACKED_RETRANS)
3157 tp->retrans_out -= acked_pcount;
3158 flag |= FLAG_RETRANS_DATA_ACKED;
3159 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3160 last_ackt = skb->skb_mstamp;
3161 WARN_ON_ONCE(last_ackt.v64 == 0);
3162 if (!first_ackt.v64)
3163 first_ackt = last_ackt;
3164
3165 reord = min(pkts_acked, reord);
3166 if (!after(scb->end_seq, tp->high_seq))
3167 flag |= FLAG_ORIG_SACK_ACKED;
3168 }
3169
3170 if (sacked & TCPCB_SACKED_ACKED) {
3171 tp->sacked_out -= acked_pcount;
3172 } else if (tcp_is_sack(tp)) {
3173 tp->delivered += acked_pcount;
3174 if (!tcp_skb_spurious_retrans(tp, skb))
3175 tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
3176 }
3177 if (sacked & TCPCB_LOST)
3178 tp->lost_out -= acked_pcount;
3179
3180 tp->packets_out -= acked_pcount;
3181 pkts_acked += acked_pcount;
3182
3183 /* Initial outgoing SYN's get put onto the write_queue
3184 * just like anything else we transmit. It is not
3185 * true data, and if we misinform our callers that
3186 * this ACK acks real data, we will erroneously exit
3187 * connection startup slow start one packet too
3188 * quickly. This is severely frowned upon behavior.
3189 */
3190 if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
3191 flag |= FLAG_DATA_ACKED;
3192 } else {
3193 flag |= FLAG_SYN_ACKED;
3194 tp->retrans_stamp = 0;
3195 }
3196
3197 if (!fully_acked)
3198 break;
3199
3200 tcp_unlink_write_queue(skb, sk);
3201 sk_wmem_free_skb(sk, skb);
3202 if (unlikely(skb == tp->retransmit_skb_hint))
3203 tp->retransmit_skb_hint = NULL;
3204 if (unlikely(skb == tp->lost_skb_hint))
3205 tp->lost_skb_hint = NULL;
3206 }
3207
3208 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
3209 tp->snd_up = tp->snd_una;
3210
3211 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3212 flag |= FLAG_SACK_RENEGING;
3213
3214 skb_mstamp_get(&now);
3215 if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
3216 seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
3217 ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
3218 }
3219 if (sack->first_sackt.v64) {
3220 sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
3221 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
3222 }
3223
3224 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3225 ca_rtt_us);
3226
3227 if (flag & FLAG_ACKED) {
3228 tcp_rearm_rto(sk);
3229 if (unlikely(icsk->icsk_mtup.probe_size &&
3230 !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3231 tcp_mtup_probe_success(sk);
3232 }
3233
3234 if (tcp_is_reno(tp)) {
3235 tcp_remove_reno_sacks(sk, pkts_acked);
3236 } else {
3237 int delta;
3238
3239
3240 if (reord < prior_fackets)
3241 tcp_update_reordering(sk, tp->fackets_out - reord, 0);
3242
3243 delta = tcp_is_fack(tp) ? pkts_acked :
3244 prior_sacked - tp->sacked_out;
3245 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3246 }
3247
3248 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3249
3250 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3251 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
3252
3253
3254
3255
3256 tcp_rearm_rto(sk);
3257 }
3258
3259 if (icsk->icsk_ca_ops->pkts_acked)
3260 icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
3261
3262#if FASTRETRANS_DEBUG > 0
3263 WARN_ON((int)tp->sacked_out < 0);
3264 WARN_ON((int)tp->lost_out < 0);
3265 WARN_ON((int)tp->retrans_out < 0);
3266 if (!tp->packets_out && tcp_is_sack(tp)) {
3267 icsk = inet_csk(sk);
3268 if (tp->lost_out) {
3269 pr_debug("Leak l=%u %d\n",
3270 tp->lost_out, icsk->icsk_ca_state);
3271 tp->lost_out = 0;
3272 }
3273 if (tp->sacked_out) {
3274 pr_debug("Leak s=%u %d\n",
3275 tp->sacked_out, icsk->icsk_ca_state);
3276 tp->sacked_out = 0;
3277 }
3278 if (tp->retrans_out) {
3279 pr_debug("Leak r=%u %d\n",
3280 tp->retrans_out, icsk->icsk_ca_state);
3281 tp->retrans_out = 0;
3282 }
3283 }
3284#endif
3285 *acked = pkts_acked;
3286 return flag;
3287}
3288
3289static void tcp_ack_probe(struct sock *sk)
3290{
3291 const struct tcp_sock *tp = tcp_sk(sk);
3292 struct inet_connection_sock *icsk = inet_csk(sk);
3293
3294
3295
3296 if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
3297 icsk->icsk_backoff = 0;
3298 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3299
3300
3301
3302 } else {
3303 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
3304
3305 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3306 when, TCP_RTO_MAX);
3307 }
3308}
3309
3310static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3311{
3312 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3313 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3314}
3315
3316
3317static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3318{
3319 /* If reordering is high then always grow cwnd whenever data is
3320 * delivered, no matter whether it was cumulatively acked or SACKed:
3321 * any forward progress hints that more packets can be sent safely.
3322 * Otherwise only raise cwnd when new data was cumulatively acked.
3323 */
3325 if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
3326 return flag & FLAG_FORWARD_PROGRESS;
3327
3328 return flag & FLAG_DATA_ACKED;
3329}
3330
3331/* The "ultimate" congestion control function that aims to replace the rigid
3332 * cwnd increase and decrease control (tcp_cong_avoid, tcp_*cwnd_reduction).
3333 * It's called toward the end of processing an ACK with precise rate
3334 * information. All transmission or retransmission are delayed afterwards.
3335 */
3336static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
3337 int flag)
3338{
3339 if (tcp_in_cwnd_reduction(sk)) {
3340
3341 tcp_cwnd_reduction(sk, acked_sacked, flag);
3342 } else if (tcp_may_raise_cwnd(sk, flag)) {
3343
3344 tcp_cong_avoid(sk, ack, acked_sacked);
3345 }
3346 tcp_update_pacing_rate(sk);
3347}
3348
3349/* Check that a window update is acceptable.
3350 * The function assumes that snd_una <= ack <= snd_nxt.
3351 */
3352static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3353 const u32 ack, const u32 ack_seq,
3354 const u32 nwin)
3355{
3356 return after(ack, tp->snd_una) ||
3357 after(ack_seq, tp->snd_wl1) ||
3358 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3359}
3360
3361
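/* If we update tp->snd_una, also update tp->bytes_acked */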
3362static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
3363{
3364 u32 delta = ack - tp->snd_una;
3365
3366 u64_stats_update_begin(&tp->syncp);
3367 tp->bytes_acked += delta;
3368 u64_stats_update_end(&tp->syncp);
3369 tp->snd_una = ack;
3370}
3371
3372
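/* If we update tp->rcv_nxt, also update tp->bytes_received */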
3373static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
3374{
3375 u32 delta = seq - tp->rcv_nxt;
3376
3377 u64_stats_update_begin(&tp->syncp);
3378 tp->bytes_received += delta;
3379 u64_stats_update_end(&tp->syncp);
3380 tp->rcv_nxt = seq;
3381}
3382
3383/* Update our send window.
3384 *
3385 * Window update algorithm, described in RFC 793 / RFC 1122 (used in
3386 * linux-2.2 and in HP-UX 10.20).
3387 */
3388static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
3389 u32 ack_seq)
3390{
3391 struct tcp_sock *tp = tcp_sk(sk);
3392 int flag = 0;
3393 u32 nwin = ntohs(tcp_hdr(skb)->window);
3394
3395 if (likely(!tcp_hdr(skb)->syn))
3396 nwin <<= tp->rx_opt.snd_wscale;
3397
3398 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
3399 flag |= FLAG_WIN_UPDATE;
3400 tcp_update_wl(tp, ack_seq);
3401
3402 if (tp->snd_wnd != nwin) {
3403 tp->snd_wnd = nwin;
3404
3405
3406
3407
3408 tp->pred_flags = 0;
3409 tcp_fast_path_check(sk);
3410
3411 if (tcp_send_head(sk))
3412 tcp_slow_start_after_idle_check(sk);
3413
3414 if (nwin > tp->max_window) {
3415 tp->max_window = nwin;
3416 tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
3417 }
3418 }
3419 }
3420
3421 tcp_snd_una_update(tp, ack);
3422
3423 return flag;
3424}
3425
3426/* Return true if we're currently rate-limiting out-of-window ACKs and
3427 * thus shouldn't send a dupack right now. We rate-limit dupacks in
3428 * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
3429 * attacks that send repeated SYNs or ACKs for the same connection. To
3430 * do this, we do not send a duplicate SYNACK or ACK if the remote
3431 * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
3432 */
3433bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
3434 int mib_idx, u32 *last_oow_ack_time)
3435{
3436
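 /* Data packets without SYNs are not likely part of an ACK loop. */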
3437 if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
3438 !tcp_hdr(skb)->syn)
3439 goto not_rate_limited;
3440
3441 if (*last_oow_ack_time) {
3442 s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
3443
3444 if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
3445 NET_INC_STATS_BH(net, mib_idx);
3446 return true;
3447 }
3448 }
3449
3450 *last_oow_ack_time = tcp_time_stamp;
3451
3452not_rate_limited:
3453 return false;
3454}
3455
3456
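/* RFC 5961 7 [ACK Throttling] */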
3457static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
3458{
3459
3460 static u32 challenge_timestamp;
3461 static unsigned int challenge_count;
3462 struct tcp_sock *tp = tcp_sk(sk);
3463 u32 now;
3464
3465
3466 if (tcp_oow_rate_limited(sock_net(sk), skb,
3467 LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
3468 &tp->last_oow_ack_time))
3469 return;
3470
3471
3472 now = jiffies / HZ;
3473 if (now != challenge_timestamp) {
3474 challenge_timestamp = now;
3475 challenge_count = 0;
3476 }
3477 if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
3478 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
3479 tcp_send_ack(sk);
3480 }
3481}
3482
3483static void tcp_store_ts_recent(struct tcp_sock *tp)
3484{
3485 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3486 tp->rx_opt.ts_recent_stamp = get_seconds();
3487}
3488
3489static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3490{
3491 if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3492 /* PAWS bug workaround wrt. ACK frames: the extra PAWS
3493 * discard check elsewhere makes sure this can only happen
3494 * for pure ACK frames. It also occurs for expired
3495 * timestamps, so only remember the peer's timestamp when
3496 * it still passes the PAWS check with a zero window.
3497 */
3499 if (tcp_paws_check(&tp->rx_opt, 0))
3500 tcp_store_ts_recent(tp);
3501 }
3502}
3503
3504/* This routine deals with acks during a TLP episode.
3505 * We mark the end of a TLP episode on receiving a TLP dupack or when the
3506 * ack is after tlp_high_seq.
3507 */
3509static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3510{
3511 struct tcp_sock *tp = tcp_sk(sk);
3512
3513 if (before(ack, tp->tlp_high_seq))
3514 return;
3515
3516 if (flag & FLAG_DSACKING_ACK) {
3517
3518 tp->tlp_high_seq = 0;
3519 } else if (after(ack, tp->tlp_high_seq)) {
3520
3521
3522
3523 tcp_init_cwnd_reduction(sk);
3524 tcp_set_ca_state(sk, TCP_CA_CWR);
3525 tcp_end_cwnd_reduction(sk);
3526 tcp_try_keep_open(sk);
3527 NET_INC_STATS_BH(sock_net(sk),
3528 LINUX_MIB_TCPLOSSPROBERECOVERY);
3529 } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
3530 FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
3531
3532 tp->tlp_high_seq = 0;
3533 }
3534}
3535
3536static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
3537{
3538 const struct inet_connection_sock *icsk = inet_csk(sk);
3539
3540 if (icsk->icsk_ca_ops->in_ack_event)
3541 icsk->icsk_ca_ops->in_ack_event(sk, flags);
3542}
3543
3544
3545/* Retransmit lost packets, or push new packets for F-RTO, as decided by
3546 * tcp_fastretrans_alert(); called at the end of ACK processing.
3547 */
3548static void tcp_xmit_recovery(struct sock *sk, int rexmit)
3549{
3550 struct tcp_sock *tp = tcp_sk(sk);
3551
3552 if (rexmit == REXMIT_NONE)
3553 return;
3554
3555 if (unlikely(rexmit == REXMIT_NEW)) {
3556 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
3557 TCP_NAGLE_OFF);
3558 if (after(tp->snd_nxt, tp->high_seq))
3559 return;
3560 tp->frto = 0;
3561 }
3562 tcp_xmit_retransmit_queue(sk);
3563}
3564
3565
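/* This routine deals with incoming acks, but not outgoing ones. */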
3566static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3567{
3568 struct inet_connection_sock *icsk = inet_csk(sk);
3569 struct tcp_sock *tp = tcp_sk(sk);
3570 struct tcp_sacktag_state sack_state;
3571 u32 prior_snd_una = tp->snd_una;
3572 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3573 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3574 bool is_dupack = false;
3575 u32 prior_fackets;
3576 int prior_packets = tp->packets_out;
3577 u32 prior_delivered = tp->delivered;
3578 int acked = 0;
3579 int rexmit = REXMIT_NONE;
3580
3581 sack_state.first_sackt.v64 = 0;
3582
3583
3584 prefetchw(sk->sk_write_queue.next);
3585
3586 /* If the ack is older than previous acks
3587 * then we can probably ignore it.
3588 */
3589 if (before(ack, prior_snd_una)) {
3590
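 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */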
3591 if (before(ack, prior_snd_una - tp->max_window)) {
3592 tcp_send_challenge_ack(sk, skb);
3593 return -1;
3594 }
3595 goto old_ack;
3596 }
3597
3598 /* If the ack includes data we haven't sent yet, discard
3599 * this segment (RFC 793 Section 3.9).
3600 */
3601 if (after(ack, tp->snd_nxt))
3602 goto invalid_ack;
3603
3604 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3605 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3606 tcp_rearm_rto(sk);
3607
3608 if (after(ack, prior_snd_una)) {
3609 flag |= FLAG_SND_UNA_ADVANCED;
3610 icsk->icsk_retransmits = 0;
3611 }
3612
3613 prior_fackets = tp->fackets_out;
3614
3615
3616
3617
3618 if (flag & FLAG_UPDATE_TS_RECENT)
3619 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3620
3621 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
3622
3623
3624
3625
3626 tcp_update_wl(tp, ack_seq);
3627 tcp_snd_una_update(tp, ack);
3628 flag |= FLAG_WIN_UPDATE;
3629
3630 tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
3631
3632 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
3633 } else {
3634 u32 ack_ev_flags = CA_ACK_SLOWPATH;
3635
3636 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
3637 flag |= FLAG_DATA;
3638 else
3639 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
3640
3641 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
3642
3643 if (TCP_SKB_CB(skb)->sacked)
3644 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3645 &sack_state);
3646
3647 if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
3648 flag |= FLAG_ECE;
3649 ack_ev_flags |= CA_ACK_ECE;
3650 }
3651
3652 if (flag & FLAG_WIN_UPDATE)
3653 ack_ev_flags |= CA_ACK_WIN_UPDATE;
3654
3655 tcp_in_ack_event(sk, ack_ev_flags);
3656 }
3657
3658
3659
3660
3661 sk->sk_err_soft = 0;
3662 icsk->icsk_probes_out = 0;
3663 tp->rcv_tstamp = tcp_time_stamp;
3664 if (!prior_packets)
3665 goto no_queue;
3666
3667
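 /* See if we can take anything off of the retransmit queue. */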
3668 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
3669 &sack_state);
3670
3671 if (tcp_ack_is_dubious(sk, flag)) {
3672 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3673 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
3674 }
3675 if (tp->tlp_high_seq)
3676 tcp_process_tlp_ack(sk, ack, flag);
3677
3678 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3679 struct dst_entry *dst = __sk_dst_get(sk);
3680 if (dst)
3681 dst_confirm(dst);
3682 }
3683
3684 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3685 tcp_schedule_loss_probe(sk);
3686 tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
3687 tcp_xmit_recovery(sk, rexmit);
3688 return 1;
3689
3690no_queue:
3691
3692 if (flag & FLAG_DSACKING_ACK)
3693 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
3694
3695
3696
3697
3698 if (tcp_send_head(sk))
3699 tcp_ack_probe(sk);
3700
3701 if (tp->tlp_high_seq)
3702 tcp_process_tlp_ack(sk, ack, flag);
3703 return 1;
3704
3705invalid_ack:
3706 SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3707 return -1;
3708
3709old_ack:
3710
3711
3712
3713 if (TCP_SKB_CB(skb)->sacked) {
3714 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3715 &sack_state);
3716 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
3717 tcp_xmit_recovery(sk, rexmit);
3718 }
3719
3720 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3721 return 0;
3722}
3723
3724static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
3725 bool syn, struct tcp_fastopen_cookie *foc,
3726 bool exp_opt)
3727{
3728
3729 if (!foc || !syn || len < 0 || (len & 1))
3730 return;
3731
3732 if (len >= TCP_FASTOPEN_COOKIE_MIN &&
3733 len <= TCP_FASTOPEN_COOKIE_MAX)
3734 memcpy(foc->val, cookie, len);
3735 else if (len != 0)
3736 len = -1;
3737 foc->len = len;
3738 foc->exp = exp_opt;
3739}
3740
3741
3742
3743
3744
3745void tcp_parse_options(const struct sk_buff *skb,
3746 struct tcp_options_received *opt_rx, int estab,
3747 struct tcp_fastopen_cookie *foc)
3748{
3749 const unsigned char *ptr;
3750 const struct tcphdr *th = tcp_hdr(skb);
3751 int length = (th->doff * 4) - sizeof(struct tcphdr);
3752
3753 ptr = (const unsigned char *)(th + 1);
3754 opt_rx->saw_tstamp = 0;
3755
3756 while (length > 0) {
3757 int opcode = *ptr++;
3758 int opsize;
3759
3760 switch (opcode) {
3761 case TCPOPT_EOL:
3762 return;
3763 case TCPOPT_NOP:
3764 length--;
3765 continue;
3766 default:
3767 opsize = *ptr++;
3768 if (opsize < 2)
3769 return;
3770 if (opsize > length)
3771 return;
3772 switch (opcode) {
3773 case TCPOPT_MSS:
3774 if (opsize == TCPOLEN_MSS && th->syn && !estab) {
3775 u16 in_mss = get_unaligned_be16(ptr);
3776 if (in_mss) {
3777 if (opt_rx->user_mss &&
3778 opt_rx->user_mss < in_mss)
3779 in_mss = opt_rx->user_mss;
3780 opt_rx->mss_clamp = in_mss;
3781 }
3782 }
3783 break;
3784 case TCPOPT_WINDOW:
3785 if (opsize == TCPOLEN_WINDOW && th->syn &&
3786 !estab && sysctl_tcp_window_scaling) {
3787 __u8 snd_wscale = *(__u8 *)ptr;
3788 opt_rx->wscale_ok = 1;
3789 if (snd_wscale > 14) {
3790 net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
3791 __func__,
3792 snd_wscale);
3793 snd_wscale = 14;
3794 }
3795 opt_rx->snd_wscale = snd_wscale;
3796 }
3797 break;
3798 case TCPOPT_TIMESTAMP:
3799 if ((opsize == TCPOLEN_TIMESTAMP) &&
3800 ((estab && opt_rx->tstamp_ok) ||
3801 (!estab && sysctl_tcp_timestamps))) {
3802 opt_rx->saw_tstamp = 1;
3803 opt_rx->rcv_tsval = get_unaligned_be32(ptr);
3804 opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
3805 }
3806 break;
3807 case TCPOPT_SACK_PERM:
3808 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3809 !estab && sysctl_tcp_sack) {
3810 opt_rx->sack_ok = TCP_SACK_SEEN;
3811 tcp_sack_reset(opt_rx);
3812 }
3813 break;
3814
3815 case TCPOPT_SACK:
3816 if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
3817 !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
3818 opt_rx->sack_ok) {
3819 TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
3820 }
3821 break;
3822#ifdef CONFIG_TCP_MD5SIG
3823 case TCPOPT_MD5SIG:
3824
3825
3826
3827
3828 break;
3829#endif
3830 case TCPOPT_FASTOPEN:
3831 tcp_parse_fastopen_option(
3832 opsize - TCPOLEN_FASTOPEN_BASE,
3833 ptr, th->syn, foc, false);
3834 break;
3835
3836 case TCPOPT_EXP:
3837
3838
3839
3840 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
3841 get_unaligned_be16(ptr) ==
3842 TCPOPT_FASTOPEN_MAGIC)
3843 tcp_parse_fastopen_option(opsize -
3844 TCPOLEN_EXP_FASTOPEN_BASE,
3845 ptr + 2, th->syn, foc, true);
3846 break;
3847
3848 }
3849 ptr += opsize-2;
3850 length -= opsize;
3851 }
3852 }
3853}
3854EXPORT_SYMBOL(tcp_parse_options);
3855
3856static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
3857{
3858 const __be32 *ptr = (const __be32 *)(th + 1);
3859
3860 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3861 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3862 tp->rx_opt.saw_tstamp = 1;
3863 ++ptr;
3864 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3865 ++ptr;
3866 if (*ptr)
3867 tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
3868 else
3869 tp->rx_opt.rcv_tsecr = 0;
3870 return true;
3871 }
3872 return false;
3873}
3874
3875
3876
3877
3878static bool tcp_fast_parse_options(const struct sk_buff *skb,
3879 const struct tcphdr *th, struct tcp_sock *tp)
3880{
3881
3882
3883
3884 if (th->doff == (sizeof(*th) / 4)) {
3885 tp->rx_opt.saw_tstamp = 0;
3886 return false;
3887 } else if (tp->rx_opt.tstamp_ok &&
3888 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
3889 if (tcp_parse_aligned_timestamp(tp, th))
3890 return true;
3891 }
3892
3893 tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3894 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3895 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3896
3897 return true;
3898}
3899
3900#ifdef CONFIG_TCP_MD5SIG
3901
3902
3903
3904const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
3905{
3906 int length = (th->doff << 2) - sizeof(*th);
3907 const u8 *ptr = (const u8 *)(th + 1);
3908
3909
3910 if (length < TCPOLEN_MD5SIG)
3911 return NULL;
3912
3913 while (length > 0) {
3914 int opcode = *ptr++;
3915 int opsize;
3916
3917 switch (opcode) {
3918 case TCPOPT_EOL:
3919 return NULL;
3920 case TCPOPT_NOP:
3921 length--;
3922 continue;
3923 default:
3924 opsize = *ptr++;
3925 if (opsize < 2 || opsize > length)
3926 return NULL;
3927 if (opcode == TCPOPT_MD5SIG)
3928 return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
3929 }
3930 ptr += opsize - 2;
3931 length -= opsize;
3932 }
3933 return NULL;
3934}
3935EXPORT_SYMBOL(tcp_parse_md5sig_option);
3936#endif
3937
3938/* PAWS as specified is problematic for pure ACKs: an ACK whose timestamp
3939 * looks old can still be harmless if it does not change any critical state
3940 * (sequence numbers, window). The predicate below verifies that a segment
3941 * is such a "disordered" duplicate ACK: a pure ACK at the current rcv_nxt,
3942 * acking nothing new, not updating the window, and with a timestamp no
3943 * more than roughly one RTO behind ts_recent. Such segments are allowed
3944 * through instead of being discarded by PAWS.
3945 */
3961static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
3962{
3963 const struct tcp_sock *tp = tcp_sk(sk);
3964 const struct tcphdr *th = tcp_hdr(skb);
3965 u32 seq = TCP_SKB_CB(skb)->seq;
3966 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3967
3968 return (
3969 (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
3970
3971
3972 ack == tp->snd_una &&
3973
3974
3975 !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
3976
3977
3978 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
3979}
3980
3981static inline bool tcp_paws_discard(const struct sock *sk,
3982 const struct sk_buff *skb)
3983{
3984 const struct tcp_sock *tp = tcp_sk(sk);
3985
3986 return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
3987 !tcp_disordered_ack(sk, skb);
3988}
3989
3990/* Check segment sequence number for validity.
3991 *
3992 * Segment controls are considered valid if the segment fits to the
3993 * window after truncation to the window. Acceptability of data (and
3994 * SYN, FIN, of course) is checked separately; see tcp_data_queue(),
3995 * for example.
3996 *
3997 * Also, controls (RST is the main one) are accepted using RCV.WUP
3998 * instead of RCV.NXT: the peer may not have advanced SND.UNA yet when
3999 * we delayed our ACK, so that its SND.UNA <= our RCV.WUP.
4000 */
4003static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
4004{
4005 return !before(end_seq, tp->rcv_wup) &&
4006 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
4007}
4008
4009
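/* When we get a reset we do this. */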
4010void tcp_reset(struct sock *sk)
4011{
4012
4013 switch (sk->sk_state) {
4014 case TCP_SYN_SENT:
4015 sk->sk_err = ECONNREFUSED;
4016 break;
4017 case TCP_CLOSE_WAIT:
4018 sk->sk_err = EPIPE;
4019 break;
4020 case TCP_CLOSE:
4021 return;
4022 default:
4023 sk->sk_err = ECONNRESET;
4024 }
4025
4026 smp_wmb();
4027
4028 if (!sock_flag(sk, SOCK_DEAD))
4029 sk->sk_error_report(sk);
4030
4031 tcp_done(sk);
4032}
4033
4034
4035/* Process the FIN bit. This now behaves as it is supposed to work
4036 * and the FIN takes effect when it is validly part of sequence
4037 * space, not before we get holes.
4038 *
4039 * If we are ESTABLISHED, a received FIN moves us to CLOSE-WAIT
4040 * (and thence onto LAST-ACK and finally CLOSE; we never enter
4041 * TIME-WAIT).
4042 *
4043 * If we are in FIN-WAIT-1, a received FIN indicates simultaneous
4044 * close and we go into CLOSING (and later onto TIME-WAIT).
4045 *
4046 * If we are in FIN-WAIT-2, a received FIN moves us to TIME-WAIT.
4047 */
4048void tcp_fin(struct sock *sk)
4049{
4050 struct tcp_sock *tp = tcp_sk(sk);
4051
4052 inet_csk_schedule_ack(sk);
4053
4054 sk->sk_shutdown |= RCV_SHUTDOWN;
4055 sock_set_flag(sk, SOCK_DONE);
4056
4057 switch (sk->sk_state) {
4058 case TCP_SYN_RECV:
4059 case TCP_ESTABLISHED:
4060
4061 tcp_set_state(sk, TCP_CLOSE_WAIT);
4062 inet_csk(sk)->icsk_ack.pingpong = 1;
4063 break;
4064
4065 case TCP_CLOSE_WAIT:
4066 case TCP_CLOSING:
4067
4068
4069
4070 break;
4071 case TCP_LAST_ACK:
4072
4073 break;
4074
4075 case TCP_FIN_WAIT1:
4076
4077
4078
4079
4080 tcp_send_ack(sk);
4081 tcp_set_state(sk, TCP_CLOSING);
4082 break;
4083 case TCP_FIN_WAIT2:
4084
4085 tcp_send_ack(sk);
4086 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4087 break;
4088 default:
4089
4090
4091
4092 pr_err("%s: Impossible, sk->sk_state=%d\n",
4093 __func__, sk->sk_state);
4094 break;
4095 }
4096
4097
4098
4099
4100 __skb_queue_purge(&tp->out_of_order_queue);
4101 if (tcp_is_sack(tp))
4102 tcp_sack_reset(&tp->rx_opt);
4103 sk_mem_reclaim(sk);
4104
4105 if (!sock_flag(sk, SOCK_DEAD)) {
4106 sk->sk_state_change(sk);
4107
4108
4109 if (sk->sk_shutdown == SHUTDOWN_MASK ||
4110 sk->sk_state == TCP_CLOSE)
4111 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
4112 else
4113 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
4114 }
4115}
4116
4117static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4118 u32 end_seq)
4119{
4120 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
4121 if (before(seq, sp->start_seq))
4122 sp->start_seq = seq;
4123 if (after(end_seq, sp->end_seq))
4124 sp->end_seq = end_seq;
4125 return true;
4126 }
4127 return false;
4128}
4129
4130static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4131{
4132 struct tcp_sock *tp = tcp_sk(sk);
4133
4134 if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
4135 int mib_idx;
4136
4137 if (before(seq, tp->rcv_nxt))
4138 mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
4139 else
4140 mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
4141
4142 NET_INC_STATS_BH(sock_net(sk), mib_idx);
4143
4144 tp->rx_opt.dsack = 1;
4145 tp->duplicate_sack[0].start_seq = seq;
4146 tp->duplicate_sack[0].end_seq = end_seq;
4147 }
4148}
4149
4150static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4151{
4152 struct tcp_sock *tp = tcp_sk(sk);
4153
4154 if (!tp->rx_opt.dsack)
4155 tcp_dsack_set(sk, seq, end_seq);
4156 else
4157 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
4158}
4159
4160static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4161{
4162 struct tcp_sock *tp = tcp_sk(sk);
4163
4164 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4165 before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4166 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4167 tcp_enter_quickack_mode(sk);
4168
4169 if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
4170 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4171
4172 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
4173 end_seq = tp->rcv_nxt;
4174 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
4175 }
4176 }
4177
4178 tcp_send_ack(sk);
4179}
4180
4181
4182
4183
4184static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4185{
4186 int this_sack;
4187 struct tcp_sack_block *sp = &tp->selective_acks[0];
4188 struct tcp_sack_block *swalk = sp + 1;
4189
4190
4191
4192
4193 for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
4194 if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
4195 int i;
4196
4197
4198
4199
4200 tp->rx_opt.num_sacks--;
4201 for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
4202 sp[i] = sp[i + 1];
4203 continue;
4204 }
4205 this_sack++, swalk++;
4206 }
4207}
4208
4209static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4210{
4211 struct tcp_sock *tp = tcp_sk(sk);
4212 struct tcp_sack_block *sp = &tp->selective_acks[0];
4213 int cur_sacks = tp->rx_opt.num_sacks;
4214 int this_sack;
4215
4216 if (!cur_sacks)
4217 goto new_sack;
4218
4219 for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
4220 if (tcp_sack_extend(sp, seq, end_seq)) {
4221
4222 for (; this_sack > 0; this_sack--, sp--)
4223 swap(*sp, *(sp - 1));
4224 if (cur_sacks > 1)
4225 tcp_sack_maybe_coalesce(tp);
4226 return;
4227 }
4228 }
4229
4230
4231
4232
4233
4234
4235
4236 if (this_sack >= TCP_NUM_SACKS) {
4237 this_sack--;
4238 tp->rx_opt.num_sacks--;
4239 sp--;
4240 }
4241 for (; this_sack > 0; this_sack--, sp--)
4242 *sp = *(sp - 1);
4243
4244new_sack:
4245
4246 sp->start_seq = seq;
4247 sp->end_seq = end_seq;
4248 tp->rx_opt.num_sacks++;
4249}
4250
4251
4252
4253static void tcp_sack_remove(struct tcp_sock *tp)
4254{
4255 struct tcp_sack_block *sp = &tp->selective_acks[0];
4256 int num_sacks = tp->rx_opt.num_sacks;
4257 int this_sack;
4258
4259
4260 if (skb_queue_empty(&tp->out_of_order_queue)) {
4261 tp->rx_opt.num_sacks = 0;
4262 return;
4263 }
4264
4265 for (this_sack = 0; this_sack < num_sacks;) {
4266
4267 if (!before(tp->rcv_nxt, sp->start_seq)) {
4268 int i;
4269
4270
4271 WARN_ON(before(tp->rcv_nxt, sp->end_seq));
4272
4273
4274 for (i = this_sack+1; i < num_sacks; i++)
4275 tp->selective_acks[i-1] = tp->selective_acks[i];
4276 num_sacks--;
4277 continue;
4278 }
4279 this_sack++;
4280 sp++;
4281 }
4282 tp->rx_opt.num_sacks = num_sacks;
4283}
4284
4285
4286/* tcp_try_coalesce - try to merge skb to prior one
4287 * @sk: socket
4288 * @to: prior buffer
4289 * @from: buffer to add in queue
4290 * @fragstolen: pointer to boolean
4291 *
4292 * Before queueing skb @from after @to, try to merge them
4293 * to reduce overall memory use and queue lengths, if cost is small.
4294 * Packets in ofo or receive queues can stay a long time.
4295 * Better try to coalesce them right now to avoid future collapses.
4296 * Returns true if caller should free @from instead of queueing it.
4297 */
4298static bool tcp_try_coalesce(struct sock *sk,
4299 struct sk_buff *to,
4300 struct sk_buff *from,
4301 bool *fragstolen)
4302{
4303 int delta;
4304
4305 *fragstolen = false;
4306
4307
4308 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4309 return false;
4310
4311 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4312 return false;
4313
4314 atomic_add(delta, &sk->sk_rmem_alloc);
4315 sk_mem_charge(sk, delta);
4316 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4317 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4318 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4319 TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
4320 return true;
4321}
4322
4323
4324
4325
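/* This one checks to see if we can put data from the
 * out_of_order queue into the receive_queue.
 */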
4326static void tcp_ofo_queue(struct sock *sk)
4327{
4328 struct tcp_sock *tp = tcp_sk(sk);
4329 __u32 dsack_high = tp->rcv_nxt;
4330 struct sk_buff *skb, *tail;
4331 bool fragstolen, eaten;
4332
4333 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
4334 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4335 break;
4336
4337 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
4338 __u32 dsack = dsack_high;
4339 if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
4340 dsack_high = TCP_SKB_CB(skb)->end_seq;
4341 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4342 }
4343
4344 __skb_unlink(skb, &tp->out_of_order_queue);
4345 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4346 SOCK_DEBUG(sk, "ofo packet was already received\n");
4347 __kfree_skb(skb);
4348 continue;
4349 }
4350 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4351 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4352 TCP_SKB_CB(skb)->end_seq);
4353
4354 tail = skb_peek_tail(&sk->sk_receive_queue);
4355 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4356 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4357 if (!eaten)
4358 __skb_queue_tail(&sk->sk_receive_queue, skb);
4359 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4360 tcp_fin(sk);
4361 if (eaten)
4362 kfree_skb_partial(skb, fragstolen);
4363 }
4364}
4365
4366static bool tcp_prune_ofo_queue(struct sock *sk);
4367static int tcp_prune_queue(struct sock *sk);
4368
4369static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4370 unsigned int size)
4371{
4372 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4373 !sk_rmem_schedule(sk, skb, size)) {
4374
4375 if (tcp_prune_queue(sk) < 0)
4376 return -1;
4377
4378 if (!sk_rmem_schedule(sk, skb, size)) {
4379 if (!tcp_prune_ofo_queue(sk))
4380 return -1;
4381
4382 if (!sk_rmem_schedule(sk, skb, size))
4383 return -1;
4384 }
4385 }
4386 return 0;
4387}
4388
4389static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4390{
4391 struct tcp_sock *tp = tcp_sk(sk);
4392 struct sk_buff *skb1;
4393 u32 seq, end_seq;
4394
4395 tcp_ecn_check_ce(tp, skb);
4396
4397 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4398 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
4399 __kfree_skb(skb);
4400 return;
4401 }
4402
4403
4404 tp->pred_flags = 0;
4405 inet_csk_schedule_ack(sk);
4406
4407 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4408 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4409 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4410
4411 skb1 = skb_peek_tail(&tp->out_of_order_queue);
4412 if (!skb1) {
4413
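 /* Initial out of order segment, build 1 SACK. */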
4414 if (tcp_is_sack(tp)) {
4415 tp->rx_opt.num_sacks = 1;
4416 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
4417 tp->selective_acks[0].end_seq =
4418 TCP_SKB_CB(skb)->end_seq;
4419 }
4420 __skb_queue_head(&tp->out_of_order_queue, skb);
4421 goto end;
4422 }
4423
4424 seq = TCP_SKB_CB(skb)->seq;
4425 end_seq = TCP_SKB_CB(skb)->end_seq;
4426
4427 if (seq == TCP_SKB_CB(skb1)->end_seq) {
4428 bool fragstolen;
4429
4430 if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
4431 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4432 } else {
4433 tcp_grow_window(sk, skb);
4434 kfree_skb_partial(skb, fragstolen);
4435 skb = NULL;
4436 }
4437
4438 if (!tp->rx_opt.num_sacks ||
4439 tp->selective_acks[0].end_seq != seq)
4440 goto add_sack;
4441
4442
4443 tp->selective_acks[0].end_seq = end_seq;
4444 goto end;
4445 }
4446
4447
4448 while (1) {
4449 if (!after(TCP_SKB_CB(skb1)->seq, seq))
4450 break;
4451 if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
4452 skb1 = NULL;
4453 break;
4454 }
4455 skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
4456 }
4457
4458
4459 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4460 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4461
4462 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4463 __kfree_skb(skb);
4464 skb = NULL;
4465 tcp_dsack_set(sk, seq, end_seq);
4466 goto add_sack;
4467 }
4468 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4469
4470 tcp_dsack_set(sk, seq,
4471 TCP_SKB_CB(skb1)->end_seq);
4472 } else {
4473 if (skb_queue_is_first(&tp->out_of_order_queue,
4474 skb1))
4475 skb1 = NULL;
4476 else
4477 skb1 = skb_queue_prev(
4478 &tp->out_of_order_queue,
4479 skb1);
4480 }
4481 }
4482 if (!skb1)
4483 __skb_queue_head(&tp->out_of_order_queue, skb);
4484 else
4485 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4486
4487
4488 while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
4489 skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
4490
4491 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4492 break;
4493 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4494 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4495 end_seq);
4496 break;
4497 }
4498 __skb_unlink(skb1, &tp->out_of_order_queue);
4499 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4500 TCP_SKB_CB(skb1)->end_seq);
4501 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4502 __kfree_skb(skb1);
4503 }
4504
4505add_sack:
4506 if (tcp_is_sack(tp))
4507 tcp_sack_new_ofo_skb(sk, seq, end_seq);
4508end:
4509 if (skb) {
4510 tcp_grow_window(sk, skb);
4511 skb_set_owner_r(skb, sk);
4512 }
4513}
4514
4515static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4516 bool *fragstolen)
4517{
4518 int eaten;
4519 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4520
4521 __skb_pull(skb, hdrlen);
4522 eaten = (tail &&
4523 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
4524 tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
4525 if (!eaten) {
4526 __skb_queue_tail(&sk->sk_receive_queue, skb);
4527 skb_set_owner_r(skb, sk);
4528 }
4529 return eaten;
4530}
4531
4532int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4533{
4534 struct sk_buff *skb;
4535 int err = -ENOMEM;
4536 int data_len = 0;
4537 bool fragstolen;
4538
4539 if (size == 0)
4540 return 0;
4541
4542 if (size > PAGE_SIZE) {
4543 int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
4544
4545 data_len = npages << PAGE_SHIFT;
4546 size = data_len + (size & ~PAGE_MASK);
4547 }
4548 skb = alloc_skb_with_frags(size - data_len, data_len,
4549 PAGE_ALLOC_COSTLY_ORDER,
4550 &err, sk->sk_allocation);
4551 if (!skb)
4552 goto err;
4553
4554 skb_put(skb, size - data_len);
4555 skb->data_len = data_len;
4556 skb->len = size;
4557
4558 if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4559 goto err_free;
4560
4561 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
4562 if (err)
4563 goto err_free;
4564
4565 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4566 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4567 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4568
4569 if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4570 WARN_ON_ONCE(fragstolen);
4571 __kfree_skb(skb);
4572 }
4573 return size;
4574
4575err_free:
4576 kfree_skb(skb);
4577err:
4578 return err;
4579
4580}
4581
4582static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4583{
4584 struct tcp_sock *tp = tcp_sk(sk);
4585 int eaten = -1;
4586 bool fragstolen = false;
4587
4588 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
4589 goto drop;
4590
4591 skb_dst_drop(skb);
4592 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
4593
4594 tcp_ecn_accept_cwr(tp, skb);
4595
4596 tp->rx_opt.dsack = 0;
4597
4598
4599
4600
4601
4602 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
4603 if (tcp_receive_window(tp) == 0)
4604 goto out_of_window;
4605
4606
4607 if (tp->ucopy.task == current &&
4608 tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
4609 sock_owned_by_user(sk) && !tp->urg_data) {
4610 int chunk = min_t(unsigned int, skb->len,
4611 tp->ucopy.len);
4612
4613 __set_current_state(TASK_RUNNING);
4614
4615 local_bh_enable();
4616 if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
4617 tp->ucopy.len -= chunk;
4618 tp->copied_seq += chunk;
4619 eaten = (chunk == skb->len);
4620 tcp_rcv_space_adjust(sk);
4621 }
4622 local_bh_disable();
4623 }
4624
4625 if (eaten <= 0) {
4626queue_and_out:
4627 if (eaten < 0) {
4628 if (skb_queue_len(&sk->sk_receive_queue) == 0)
4629 sk_forced_mem_schedule(sk, skb->truesize);
4630 else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4631 goto drop;
4632 }
4633 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4634 }
4635 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4636 if (skb->len)
4637 tcp_event_data_recv(sk, skb);
4638 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4639 tcp_fin(sk);
4640
4641 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4642 tcp_ofo_queue(sk);
4643
4644
4645
4646
4647 if (skb_queue_empty(&tp->out_of_order_queue))
4648 inet_csk(sk)->icsk_ack.pingpong = 0;
4649 }
4650
4651 if (tp->rx_opt.num_sacks)
4652 tcp_sack_remove(tp);
4653
4654 tcp_fast_path_check(sk);
4655
4656 if (eaten > 0)
4657 kfree_skb_partial(skb, fragstolen);
4658 if (!sock_flag(sk, SOCK_DEAD))
4659 sk->sk_data_ready(sk);
4660 return;
4661 }
4662
4663 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4664
4665 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4666 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4667
4668out_of_window:
4669 tcp_enter_quickack_mode(sk);
4670 inet_csk_schedule_ack(sk);
4671drop:
4672 __kfree_skb(skb);
4673 return;
4674 }
4675
4676
4677 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
4678 goto out_of_window;
4679
4680 tcp_enter_quickack_mode(sk);
4681
4682 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4683
4684 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
4685 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4686 TCP_SKB_CB(skb)->end_seq);
4687
4688 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
4689
4690
4691
4692
4693 if (!tcp_receive_window(tp))
4694 goto out_of_window;
4695 goto queue_and_out;
4696 }
4697
4698 tcp_data_queue_ofo(sk, skb);
4699}
4700
4701static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4702 struct sk_buff_head *list)
4703{
4704 struct sk_buff *next = NULL;
4705
4706 if (!skb_queue_is_last(list, skb))
4707 next = skb_queue_next(list, skb);
4708
4709 __skb_unlink(skb, list);
4710 __kfree_skb(skb);
4711 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4712
4713 return next;
4714}
4715
4716
4717/* Collapse contiguous sequence of skbs head..tail with
4718 * sequence numbers start..end.
4719 *
4720 * If tail is NULL, this means until the end of the list.
4721 *
4722 * Segments with FIN/SYN are not collapsed (only because this simplifies code).
4723 */
4724static void
4725tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4726 struct sk_buff *head, struct sk_buff *tail,
4727 u32 start, u32 end)
4728{
4729 struct sk_buff *skb, *n;
4730 bool end_of_skbs;
4731
4732
4733
4734 skb = head;
4735restart:
4736 end_of_skbs = true;
4737 skb_queue_walk_from_safe(list, skb, n) {
4738 if (skb == tail)
4739 break;
4740
4741 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4742 skb = tcp_collapse_one(sk, skb, list);
4743 if (!skb)
4744 break;
4745 goto restart;
4746 }
4747
4748
4749
4750
4751
4752
4753 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
4754 (tcp_win_from_space(skb->truesize) > skb->len ||
4755 before(TCP_SKB_CB(skb)->seq, start))) {
4756 end_of_skbs = false;
4757 break;
4758 }
4759
4760 if (!skb_queue_is_last(list, skb)) {
4761 struct sk_buff *next = skb_queue_next(list, skb);
4762 if (next != tail &&
4763 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
4764 end_of_skbs = false;
4765 break;
4766 }
4767 }
4768
4769
4770 start = TCP_SKB_CB(skb)->end_seq;
4771 }
4772 if (end_of_skbs ||
4773 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4774 return;
4775
4776 while (before(start, end)) {
4777 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
4778 struct sk_buff *nskb;
4779
4780 nskb = alloc_skb(copy, GFP_ATOMIC);
4781 if (!nskb)
4782 return;
4783
4784 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4785 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4786 __skb_queue_before(list, skb, nskb);
4787 skb_set_owner_r(nskb, sk);
4788
4789
4790 while (copy > 0) {
4791 int offset = start - TCP_SKB_CB(skb)->seq;
4792 int size = TCP_SKB_CB(skb)->end_seq - start;
4793
4794 BUG_ON(offset < 0);
4795 if (size > 0) {
4796 size = min(copy, size);
4797 if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
4798 BUG();
4799 TCP_SKB_CB(nskb)->end_seq += size;
4800 copy -= size;
4801 start += size;
4802 }
4803 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4804 skb = tcp_collapse_one(sk, skb, list);
4805 if (!skb ||
4806 skb == tail ||
4807 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4808 return;
4809 }
4810 }
4811 }
4812}
4813
4814
4815
4816
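/* Collapse the out-of-order queue: pick contiguous runs of skbs and
 * tcp_collapse() them until the whole queue has been walked.
 */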
4817static void tcp_collapse_ofo_queue(struct sock *sk)
4818{
4819 struct tcp_sock *tp = tcp_sk(sk);
4820 struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
4821 struct sk_buff *head;
4822 u32 start, end;
4823
4824 if (!skb)
4825 return;
4826
4827 start = TCP_SKB_CB(skb)->seq;
4828 end = TCP_SKB_CB(skb)->end_seq;
4829 head = skb;
4830
4831 for (;;) {
4832 struct sk_buff *next = NULL;
4833
4834 if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
4835 next = skb_queue_next(&tp->out_of_order_queue, skb);
4836 skb = next;
4837
4838
4839
4840 if (!skb ||
4841 after(TCP_SKB_CB(skb)->seq, end) ||
4842 before(TCP_SKB_CB(skb)->end_seq, start)) {
4843 tcp_collapse(sk, &tp->out_of_order_queue,
4844 head, skb, start, end);
4845 head = skb;
4846 if (!skb)
4847 break;
4848
4849 start = TCP_SKB_CB(skb)->seq;
4850 end = TCP_SKB_CB(skb)->end_seq;
4851 } else {
4852 if (before(TCP_SKB_CB(skb)->seq, start))
4853 start = TCP_SKB_CB(skb)->seq;
4854 if (after(TCP_SKB_CB(skb)->end_seq, end))
4855 end = TCP_SKB_CB(skb)->end_seq;
4856 }
4857 }
4858}
4859
4860
4861
4862
4863
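/* Purge the out-of-order queue to free memory.
 * Return true if the queue was pruned.
 */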
4864static bool tcp_prune_ofo_queue(struct sock *sk)
4865{
4866 struct tcp_sock *tp = tcp_sk(sk);
4867 bool res = false;
4868
4869 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4870 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
4871 __skb_queue_purge(&tp->out_of_order_queue);
4872
4873
4874
4875
4876
4877
4878 if (tp->rx_opt.sack_ok)
4879 tcp_sack_reset(&tp->rx_opt);
4880 sk_mem_reclaim(sk);
4881 res = true;
4882 }
4883 return res;
4884}
4885
4886
4887/* Reduce allocated memory if we can, trying to get
4888 * the socket within its memory limits again.
4889 *
4890 * Return less than zero if we should start dropping frames
4891 * until the socket-owning process reads some of the data to
4892 * stabilize the situation. */
4893static int tcp_prune_queue(struct sock *sk)
4894{
4895 struct tcp_sock *tp = tcp_sk(sk);
4896
4897 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
4898
4899 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
4900
4901 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
4902 tcp_clamp_window(sk);
4903 else if (tcp_under_memory_pressure(sk))
4904 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
4905
4906 tcp_collapse_ofo_queue(sk);
4907 if (!skb_queue_empty(&sk->sk_receive_queue))
4908 tcp_collapse(sk, &sk->sk_receive_queue,
4909 skb_peek(&sk->sk_receive_queue),
4910 NULL,
4911 tp->copied_seq, tp->rcv_nxt);
4912 sk_mem_reclaim(sk);
4913
4914 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
4915 return 0;
4916
4917
4918
4919
4920 tcp_prune_ofo_queue(sk);
4921
4922 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
4923 return 0;
4924
4925
4926
4927
4928
4929 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
4930
4931
4932 tp->pred_flags = 0;
4933 return -1;
4934}
4935
4936static bool tcp_should_expand_sndbuf(const struct sock *sk)
4937{
4938 const struct tcp_sock *tp = tcp_sk(sk);
4939
4940
4941
4942
4943 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
4944 return false;
4945
4946
4947 if (tcp_under_memory_pressure(sk))
4948 return false;
4949
4950
4951 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
4952 return false;
4953
4954
4955 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
4956 return false;
4957
4958 return true;
4959}
4960
4961
4962
4963
4964
4965
4966
4967static void tcp_new_space(struct sock *sk)
4968{
4969 struct tcp_sock *tp = tcp_sk(sk);
4970
4971 if (tcp_should_expand_sndbuf(sk)) {
4972 tcp_sndbuf_expand(sk);
4973 tp->snd_cwnd_stamp = tcp_time_stamp;
4974 }
4975
4976 sk->sk_write_space(sk);
4977}
4978
4979static void tcp_check_space(struct sock *sk)
4980{
4981 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
4982 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4983
4984 smp_mb__after_atomic();
4985 if (sk->sk_socket &&
4986 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
4987 tcp_new_space(sk);
4988 }
4989}
4990
4991static inline void tcp_data_snd_check(struct sock *sk)
4992{
4993 tcp_push_pending_frames(sk);
4994 tcp_check_space(sk);
4995}
4996
4997
4998
4999
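/* Decide whether an ACK should be sent immediately or delayed. */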
5000static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
5001{
5002 struct tcp_sock *tp = tcp_sk(sk);
5003
5004 /* More than one full frame received... */
5005 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
5006 /* ... and right edge of window advances far enough.
5007 * (tcp_recvmsg() will send ACK otherwise).
5008 * Or... */
5009 __tcp_select_window(sk) >= tp->rcv_wnd) ||
5010 /* We ACK each frame or... */
5011 tcp_in_quickack_mode(sk) ||
5012 /* We have out of order data. */
5013 (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
5014
5015 tcp_send_ack(sk);
5016 } else {
5017
5018 tcp_send_delayed_ack(sk);
5019 }
5020}
5021
5022static inline void tcp_ack_snd_check(struct sock *sk)
5023{
5024 if (!inet_csk_ack_scheduled(sk)) {
5025
5026 return;
5027 }
5028 __tcp_ack_snd_check(sk, 1);
5029}
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
5042{
5043 struct tcp_sock *tp = tcp_sk(sk);
5044 u32 ptr = ntohs(th->urg_ptr);
5045
5046 if (ptr && !sysctl_tcp_stdurg)
5047 ptr--;
5048 ptr += ntohl(th->seq);
5049
5050
5051 if (after(tp->copied_seq, ptr))
5052 return;
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064 if (before(ptr, tp->rcv_nxt))
5065 return;
5066
5067
5068 if (tp->urg_data && !after(ptr, tp->urg_seq))
5069 return;
5070
5071
5072 sk_send_sigurg(sk);
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
5090 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
5091 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
5092 tp->copied_seq++;
5093 if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
5094 __skb_unlink(skb, &sk->sk_receive_queue);
5095 __kfree_skb(skb);
5096 }
5097 }
5098
5099 tp->urg_data = TCP_URG_NOTYET;
5100 tp->urg_seq = ptr;
5101
5102
5103 tp->pred_flags = 0;
5104}
5105
5106
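/* This is the 'fast' part of urgent handling. */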
5107static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
5108{
5109 struct tcp_sock *tp = tcp_sk(sk);
5110
5111
5112 if (th->urg)
5113 tcp_check_urg(sk, th);
5114
5115
5116 if (tp->urg_data == TCP_URG_NOTYET) {
5117 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
5118 th->syn;
5119
5120
5121 if (ptr < skb->len) {
5122 u8 tmp;
5123 if (skb_copy_bits(skb, ptr, &tmp, 1))
5124 BUG();
5125 tp->urg_data = TCP_URG_VALID | tmp;
5126 if (!sock_flag(sk, SOCK_DEAD))
5127 sk->sk_data_ready(sk);
5128 }
5129 }
5130}
5131
5132static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
5133{
5134 struct tcp_sock *tp = tcp_sk(sk);
5135 int chunk = skb->len - hlen;
5136 int err;
5137
5138 local_bh_enable();
5139 if (skb_csum_unnecessary(skb))
5140 err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
5141 else
5142 err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
5143
5144 if (!err) {
5145 tp->ucopy.len -= chunk;
5146 tp->copied_seq += chunk;
5147 tcp_rcv_space_adjust(sk);
5148 }
5149
5150 local_bh_disable();
5151 return err;
5152}
5153
5154static __sum16 __tcp_checksum_complete_user(struct sock *sk,
5155 struct sk_buff *skb)
5156{
5157 __sum16 result;
5158
5159 if (sock_owned_by_user(sk)) {
5160 local_bh_enable();
5161 result = __tcp_checksum_complete(skb);
5162 local_bh_disable();
5163 } else {
5164 result = __tcp_checksum_complete(skb);
5165 }
5166 return result;
5167}
5168
5169static inline bool tcp_checksum_complete_user(struct sock *sk,
5170 struct sk_buff *skb)
5171{
5172 return !skb_csum_unnecessary(skb) &&
5173 __tcp_checksum_complete_user(sk, skb);
5174}
5175
5176
5177
5178
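/* Does PAWS and sequence-number based validation of an incoming segment.
 * Returns false (and frees the skb) if the segment must be dropped.
 */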
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
				  const struct tcphdr *th, int syn_inerr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
	    tcp_paws_discard(sk, skb)) {
		if (!th->rst) {
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
			if (!tcp_oow_rate_limited(sock_net(sk), skb,
						  LINUX_MIB_TCPACKSKIPPEDPAWS,
						  &tp->last_oow_ack_time))
				tcp_send_dupack(sk, skb);
			goto discard;
		}
	}

	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
		if (!th->rst) {
			if (th->syn)
				goto syn_challenge;
			if (!tcp_oow_rate_limited(sock_net(sk), skb,
						  LINUX_MIB_TCPACKSKIPPEDSEQ,
						  &tp->last_oow_ack_time))
				tcp_send_dupack(sk, skb);
		}
		goto discard;
	}

	if (th->rst) {
		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
			tcp_reset(sk);
		else
			tcp_send_challenge_ack(sk, skb);
		goto discard;
	}

	if (th->syn) {
syn_challenge:
		if (syn_inerr)
			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
		tcp_send_challenge_ack(sk, skb);
		goto discard;
	}

	return true;

discard:
	__kfree_skb(skb);
	return false;
}

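/* TCP receive function for the ESTABLISHED state.
 *
 * Implements Van Jacobson style "header prediction": if the segment is
 * exactly the next expected in-order segment (or a pure ACK) whose
 * flag/window word matches tp->pred_flags, it is handled on a short fast
 * path; everything else drops to the slow path below, which does full
 * option parsing and the RFC 793 style checks via tcp_validate_incoming().
 */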
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
			 const struct tcphdr *th, unsigned int len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unlikely(!sk->sk_rx_dst))
		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);

	tp->rx_opt.saw_tstamp = 0;

	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
		int tcp_header_len = tp->tcp_header_len;

		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
			if (!tcp_parse_aligned_timestamp(tp, th))
				goto slow_path;

			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
				goto slow_path;
		}

		if (len <= tcp_header_len) {
			if (len == tcp_header_len) {
				if (tcp_header_len ==
				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				    tp->rcv_nxt == tp->rcv_wup)
					tcp_store_ts_recent(tp);

				tcp_ack(sk, skb, 0);
				__kfree_skb(skb);
				tcp_data_snd_check(sk);
				return;
			} else {
				TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
				goto discard;
			}
		} else {
			int eaten = 0;
			bool fragstolen = false;

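			/* If the receiver is waiting in tcp_recvmsg() for
			 * exactly this data and its buffer has room, copy the
			 * payload straight to user space and bypass the
			 * receive queue.
			 */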
			if (tp->ucopy.task == current &&
			    tp->copied_seq == tp->rcv_nxt &&
			    len - tcp_header_len <= tp->ucopy.len &&
			    sock_owned_by_user(sk)) {
				__set_current_state(TASK_RUNNING);

				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
					if (tcp_header_len ==
					    (sizeof(struct tcphdr) +
					     TCPOLEN_TSTAMP_ALIGNED) &&
					    tp->rcv_nxt == tp->rcv_wup)
						tcp_store_ts_recent(tp);

					tcp_rcv_rtt_measure_ts(sk, skb);

					__skb_pull(skb, tcp_header_len);
					tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
					NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
					eaten = 1;
				}
			}
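			/* Data was not copied to user space: verify the
			 * checksum if needed, make sure we have the memory to
			 * queue it, and add it to the receive queue.
			 */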
			if (!eaten) {
				if (tcp_checksum_complete_user(sk, skb))
					goto csum_error;

				if ((int)skb->truesize > sk->sk_forward_alloc)
					goto step5;

				if (tcp_header_len ==
				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				    tp->rcv_nxt == tp->rcv_wup)
					tcp_store_ts_recent(tp);

				tcp_rcv_rtt_measure_ts(sk, skb);

				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);

				eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
						      &fragstolen);
			}

			tcp_event_data_recv(sk, skb);

			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
				tcp_ack(sk, skb, FLAG_DATA);
				tcp_data_snd_check(sk);
				if (!inet_csk_ack_scheduled(sk))
					goto no_ack;
			}

			__tcp_ack_snd_check(sk, 0);
no_ack:
			if (eaten)
				kfree_skb_partial(skb, fragstolen);
			sk->sk_data_ready(sk);
			return;
		}
	}

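/* Slow path: full checksum, option and sequence validation. */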
slow_path:
	if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
		goto csum_error;

	if (!th->ack && !th->rst && !th->syn)
		goto discard;

	if (!tcp_validate_incoming(sk, skb, th, 1))
		return;

step5:
	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
		goto discard;

	tcp_rcv_rtt_measure_ts(sk, skb);

	tcp_urg(sk, skb, th);

	tcp_data_queue(sk, skb);

	tcp_data_snd_check(sk);
	tcp_ack_snd_check(sk);
	return;

csum_error:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);

discard:
	__kfree_skb(skb);
}
EXPORT_SYMBOL(tcp_rcv_established);

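/* Complete the transition to ESTABLISHED after a successful handshake:
 * cache the route, initialise metrics, congestion control and buffer
 * sizes, arm keepalives if requested, and wake up whoever is waiting on
 * connect().
 */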
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_set_state(sk, TCP_ESTABLISHED);

	if (skb) {
		icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
		security_inet_conn_established(sk, skb);
	}

	icsk->icsk_af_ops->rebuild_header(sk);

	tcp_init_metrics(sk);

	tcp_init_congestion_control(sk);

	tp->lsndtime = tcp_time_stamp;

	tcp_init_buffer_space(sk);

	if (sock_flag(sk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));

	if (!tp->rx_opt.snd_wscale)
		__tcp_fast_path_on(tp, tp->snd_wnd);
	else
		tp->pred_flags = 0;

	if (!sock_flag(sk, SOCK_DEAD)) {
		sk->sk_state_change(sk);
		sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
	}
}

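/* Handle the SYN-ACK of an active TCP Fast Open attempt: determine the
 * real MSS, update the cached Fast Open cookie/MSS state, and if data
 * sent on the SYN was not acknowledged, retransmit it.  Returns true if
 * the Fast Open attempt failed and the SYN data had to be retransmitted.
 */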
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
				    struct tcp_fastopen_cookie *cookie)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
	bool syn_drop = false;

	if (mss == tp->rx_opt.user_mss) {
		struct tcp_options_received opt;

		tcp_clear_options(&opt);
		opt.user_mss = opt.mss_clamp = 0;
		tcp_parse_options(synack, &opt, 0, NULL);
		mss = opt.mss_clamp;
	}

	if (!tp->syn_fastopen) {
		cookie->len = -1;
	} else if (tp->total_retrans) {
		syn_drop = (cookie->len < 0 && data);
	} else if (cookie->len < 0 && !tp->syn_data) {
		try_exp = tp->syn_fastopen_exp ? 2 : 1;
	}

	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);

	if (data) {
		tcp_for_write_queue_from(data, sk) {
			if (data == tcp_send_head(sk) ||
			    __tcp_retransmit_skb(sk, data))
				break;
		}
		tcp_rearm_rto(sk);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
		return true;
	}
	tp->syn_data_acked = tp->syn_data;
	if (tp->syn_data_acked)
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);

	tcp_fastopen_add_skb(sk, synack);

	return false;
}

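/* SYN-SENT state handling (the "SEGMENT ARRIVES" rules of RFC 793 for
 * SYN-SENT): validate the ACK and RST bits, complete the three-way
 * handshake on a valid SYN-ACK, or move to SYN-RECV on a simultaneous
 * open.  A positive return value asks the caller to send a reset, zero
 * means the segment was consumed here, and a negative value lets the
 * caller finish processing (URG handling) and free the skb itself.
 */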
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
					 const struct tcphdr *th)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_fastopen_cookie foc = { .len = -1 };
	int saved_clamp = tp->rx_opt.mss_clamp;

	tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;

	if (th->ack) {
		if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
		    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
			goto reset_and_undo;

		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
			     tcp_time_stamp)) {
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
			goto reset_and_undo;
		}

		if (th->rst) {
			tcp_reset(sk);
			goto discard;
		}

		if (!th->syn)
			goto discard_and_undo;

		tcp_ecn_rcv_synack(tp, th);

		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
		tcp_ack(sk, skb, FLAG_SLOWPATH);

		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

		tp->snd_wnd = ntohs(th->window);

		if (!tp->rx_opt.wscale_ok) {
			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
			tp->window_clamp = min(tp->window_clamp, 65535U);
		}

		if (tp->rx_opt.saw_tstamp) {
			tp->rx_opt.tstamp_ok = 1;
			tp->tcp_header_len =
				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
			tcp_store_ts_recent(tp);
		} else {
			tp->tcp_header_len = sizeof(struct tcphdr);
		}

		if (tcp_is_sack(tp) && sysctl_tcp_fack)
			tcp_enable_fack(tp);

		tcp_mtup_init(sk);
		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
		tcp_initialize_rcv_mss(sk);

		tp->copied_seq = tp->rcv_nxt;

		smp_mb();

		tcp_finish_connect(sk, skb);

		if ((tp->syn_fastopen || tp->syn_data) &&
		    tcp_rcv_fastopen_synack(sk, skb, &foc))
			return -1;

		if (sk->sk_write_pending ||
		    icsk->icsk_accept_queue.rskq_defer_accept ||
		    icsk->icsk_ack.pingpong) {
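			/* Save one ACK: the ACK completing the handshake can
			 * ride on the first data segment or on a later
			 * delayed ACK, so only schedule a delayed ACK here
			 * instead of sending one immediately.
			 */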
			inet_csk_schedule_ack(sk);
			icsk->icsk_ack.lrcvtime = tcp_time_stamp;
			tcp_enter_quickack_mode(sk);
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  TCP_DELACK_MAX, TCP_RTO_MAX);

discard:
			__kfree_skb(skb);
			return 0;
		} else {
			tcp_send_ack(sk);
		}
		return -1;
	}

	if (th->rst) {
		goto discard_and_undo;
	}

	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
	    tcp_paws_reject(&tp->rx_opt, 0))
		goto discard_and_undo;

	if (th->syn) {
		tcp_set_state(sk, TCP_SYN_RECV);

		if (tp->rx_opt.saw_tstamp) {
			tp->rx_opt.tstamp_ok = 1;
			tcp_store_ts_recent(tp);
			tp->tcp_header_len =
				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			tp->tcp_header_len = sizeof(struct tcphdr);
		}

		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
		tp->copied_seq = tp->rcv_nxt;
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

		tp->snd_wnd = ntohs(th->window);
		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
		tp->max_window = tp->snd_wnd;

		tcp_ecn_rcv_syn(tp, th);

		tcp_mtup_init(sk);
		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
		tcp_initialize_rcv_mss(sk);

		tcp_send_synack(sk);
#if 0
		return -1;
#else
		goto discard;
#endif
	}

discard_and_undo:
	tcp_clear_options(&tp->rx_opt);
	tp->rx_opt.mss_clamp = saved_clamp;
	goto discard;

reset_and_undo:
	tcp_clear_options(&tp->rx_opt);
	tp->rx_opt.mss_clamp = saved_clamp;
	return 1;
}

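/* This function implements the receiving procedure of RFC 793 for all
 * states except ESTABLISHED and TIME_WAIT; it must be called with the
 * socket locked.  It returns 0 when the segment has been consumed
 * (queued or freed) and a non-zero value when the caller should send a
 * reset.
 */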
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcphdr *th = tcp_hdr(skb);
	struct request_sock *req;
	int queued = 0;
	bool acceptable;

	tp->rx_opt.saw_tstamp = 0;

	switch (sk->sk_state) {
	case TCP_CLOSE:
		goto discard;

	case TCP_LISTEN:
		if (th->ack)
			return 1;

		if (th->rst)
			goto discard;

		if (th->syn) {
			if (th->fin)
				goto discard;
			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
				return 1;

			kfree_skb(skb);
			return 0;
		}
		goto discard;

	case TCP_SYN_SENT:
		queued = tcp_rcv_synsent_state_process(sk, skb, th);
		if (queued >= 0)
			return queued;

		tcp_urg(sk, skb, th);
		__kfree_skb(skb);
		tcp_data_snd_check(sk);
		return 0;
	}

	req = tp->fastopen_rsk;
	if (req) {
		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
			     sk->sk_state != TCP_FIN_WAIT1);

		if (!tcp_check_req(sk, skb, req, true))
			goto discard;
	}

	if (!th->ack && !th->rst && !th->syn)
		goto discard;

	if (!tcp_validate_incoming(sk, skb, th, 0))
		return 0;

	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
				      FLAG_UPDATE_TS_RECENT) > 0;

	switch (sk->sk_state) {
	case TCP_SYN_RECV:
		if (!acceptable)
			return 1;

		if (!tp->srtt_us)
			tcp_synack_rtt_meas(sk, req);

		if (req) {
			tp->total_retrans = req->num_retrans;
			reqsk_fastopen_remove(sk, req, false);
		} else {
			icsk->icsk_af_ops->rebuild_header(sk);
			tcp_init_congestion_control(sk);

			tcp_mtup_init(sk);
			tp->copied_seq = tp->rcv_nxt;
			tcp_init_buffer_space(sk);
		}
		smp_mb();
		tcp_set_state(sk, TCP_ESTABLISHED);
		sk->sk_state_change(sk);

		if (sk->sk_socket)
			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

		if (tp->rx_opt.tstamp_ok)
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

		if (req) {
			tcp_rearm_rto(sk);
		} else
			tcp_init_metrics(sk);

		tcp_update_pacing_rate(sk);

		tp->lsndtime = tcp_time_stamp;

		tcp_initialize_rcv_mss(sk);
		tcp_fast_path_on(tp);
		break;

	case TCP_FIN_WAIT1: {
		struct dst_entry *dst;
		int tmo;

		if (req) {
			if (!acceptable)
				return 1;

			reqsk_fastopen_remove(sk, req, false);
			tcp_rearm_rto(sk);
		}
		if (tp->snd_una != tp->write_seq)
			break;

		tcp_set_state(sk, TCP_FIN_WAIT2);
		sk->sk_shutdown |= SEND_SHUTDOWN;

		dst = __sk_dst_get(sk);
		if (dst)
			dst_confirm(dst);

		if (!sock_flag(sk, SOCK_DEAD)) {
			sk->sk_state_change(sk);
			break;
		}

		if (tp->linger2 < 0 ||
		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
		     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
			tcp_done(sk);
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
			return 1;
		}

		tmo = tcp_fin_time(sk);
		if (tmo > TCP_TIMEWAIT_LEN) {
			inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
		} else if (th->fin || sock_owned_by_user(sk)) {
			inet_csk_reset_keepalive_timer(sk, tmo);
		} else {
			tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
			goto discard;
		}
		break;
	}

	case TCP_CLOSING:
		if (tp->snd_una == tp->write_seq) {
			tcp_time_wait(sk, TCP_TIME_WAIT, 0);
			goto discard;
		}
		break;

	case TCP_LAST_ACK:
		if (tp->snd_una == tp->write_seq) {
			tcp_update_metrics(sk);
			tcp_done(sk);
			goto discard;
		}
		break;
	}

	tcp_urg(sk, skb, th);

	switch (sk->sk_state) {
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
	case TCP_LAST_ACK:
		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
			break;
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
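		/* RFC 793 says to queue data in these states; RFC 1122 and
		 * BSD instead abort the connection if new data arrives after
		 * the receive side has been shut down.
		 */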
		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				tcp_reset(sk);
				return 1;
			}
		}
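		/* fall through */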
	case TCP_ESTABLISHED:
		tcp_data_queue(sk, skb);
		queued = 1;
		break;
	}

	if (sk->sk_state != TCP_CLOSE) {
		tcp_data_snd_check(sk);
		tcp_ack_snd_check(sk);
	}

	if (!queued) {
discard:
		__kfree_skb(skb);
	}
	return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);

static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	if (family == AF_INET)
		net_dbg_ratelimited("drop open request from %pI4/%u\n",
				    &ireq->ir_rmt_addr, port);
#if IS_ENABLED(CONFIG_IPV6)
	else if (family == AF_INET6)
		net_dbg_ratelimited("drop open request from %pI6/%u\n",
				    &ireq->ir_v6_rmt_addr, port);
#endif
}

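/* Decide whether the new connection may use ECN.  The SYN must carry
 * ECE and CWR; beyond that, ECN is accepted if the SYN itself was not
 * ECT-marked while ECN is enabled (by sysctl or by the destination
 * route), if the listener's congestion control needs ECN, or if the
 * route asks for an ECN-capable congestion control.
 */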
static void tcp_ecn_create_request(struct request_sock *req,
				   const struct sk_buff *skb,
				   const struct sock *listen_sk,
				   const struct dst_entry *dst)
{
	const struct tcphdr *th = tcp_hdr(skb);
	const struct net *net = sock_net(listen_sk);
	bool th_ecn = th->ece && th->cwr;
	bool ect, ecn_ok;
	u32 ecn_ok_dst;

	if (!th_ecn)
		return;

	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;

	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
	    (ecn_ok_dst & DST_FEATURE_ECN_CA))
		inet_rsk(req)->ecn_ok = 1;
}

static void tcp_openreq_init(struct request_sock *req,
			     const struct tcp_options_received *rx_opt,
			     struct sk_buff *skb, const struct sock *sk)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	req->rsk_rcv_wnd = 0;
	req->cookie_ts = 0;
	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
	skb_mstamp_get(&tcp_rsk(req)->snt_synack);
	tcp_rsk(req)->last_oow_ack_time = 0;
	req->mss = rx_opt->mss_clamp;
	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
	ireq->tstamp_ok = rx_opt->tstamp_ok;
	ireq->sack_ok = rx_opt->sack_ok;
	ireq->snd_wscale = rx_opt->snd_wscale;
	ireq->wscale_ok = rx_opt->wscale_ok;
	ireq->acked = 0;
	ireq->ecn_ok = 0;
	ireq->ir_rmt_port = tcp_hdr(skb)->source;
	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
	ireq->ir_mark = inet_request_mark(sk, skb);
}

struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
				      struct sock *sk_listener,
				      bool attach_listener)
{
	struct request_sock *req = reqsk_alloc(ops, sk_listener,
					       attach_listener);

	if (req) {
		struct inet_request_sock *ireq = inet_rsk(req);

		kmemcheck_annotate_bitfield(ireq, flags);
		ireq->opt = NULL;
		atomic64_set(&ireq->ir_cookie, 0);
		ireq->ireq_state = TCP_NEW_SYN_RECV;
		write_pnet(&ireq->ireq_net, sock_net(sk_listener));
		ireq->ireq_family = sk_listener->sk_family;
	}

	return req;
}
EXPORT_SYMBOL(inet_reqsk_alloc);

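/* Called when the SYN queue is full (or syncookies are forced): decide
 * whether to answer with a SYN cookie or drop the SYN, bump the
 * corresponding SNMP counter, and warn once per listen socket.
 */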
static bool tcp_syn_flood_action(const struct sock *sk,
				 const struct sk_buff *skb,
				 const char *proto)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct net *net = sock_net(sk);

#ifdef CONFIG_SYN_COOKIES
	if (net->ipv4.sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	if (!queue->synflood_warned &&
	    net->ipv4.sysctl_tcp_syncookies != 2 &&
	    xchg(&queue->synflood_warned, 1) == 0)
		pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);

	return want_cookie;
}

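/* If the listener enabled TCP_SAVE_SYN, keep a copy of the network and
 * TCP headers of the SYN on the request sock so that the application
 * can retrieve them later with the TCP_SAVED_SYN socket option.
 */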
static void tcp_reqsk_record_syn(const struct sock *sk,
				 struct request_sock *req,
				 const struct sk_buff *skb)
{
	if (tcp_sk(sk)->save_syn) {
		u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
		u32 *copy;

		copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
		if (copy) {
			copy[0] = len;
			memcpy(&copy[1], skb_network_header(skb), len);
			req->saved_syn = copy;
		}
	}
}

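/* Shared SYN processing for listening sockets (used by both IPv4 and
 * IPv6): apply SYN-flood/syncookie policy, allocate and initialise a
 * request sock, choose the initial sequence number, and send the
 * SYN-ACK, either from the listener or from a freshly created Fast Open
 * child socket.
 */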
int tcp_conn_request(struct request_sock_ops *rsk_ops,
		     const struct tcp_request_sock_ops *af_ops,
		     struct sock *sk, struct sk_buff *skb)
{
	struct tcp_fastopen_cookie foc = { .len = -1 };
	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
	struct tcp_options_received tmp_opt;
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct sock *fastopen_sk = NULL;
	struct dst_entry *dst = NULL;
	struct request_sock *req;
	bool want_cookie = false;
	struct flowi fl;

	if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
		if (!want_cookie)
			goto drop;
	}

	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
	if (!req)
		goto drop;

	tcp_rsk(req)->af_specific = af_ops;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = af_ops->mss_clamp;
	tmp_opt.user_mss = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb, sk);

	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

	af_ops->init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie && !isn) {
		if (tcp_death_row.sysctl_tw_recycle) {
			bool strict;

			dst = af_ops->route_req(sk, &fl, req, &strict);

			if (dst && strict &&
			    !tcp_peer_is_proven(req, dst, true,
						tmp_opt.saw_tstamp)) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}

		else if (!net->ipv4.sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 !tcp_peer_is_proven(req, dst, false,
					     tmp_opt.saw_tstamp)) {
			pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				    rsk_ops->family);
			goto drop_and_release;
		}

		isn = af_ops->init_seq(skb);
	}
	if (!dst) {
		dst = af_ops->route_req(sk, &fl, req, NULL);
		if (!dst)
			goto drop_and_free;
	}

	tcp_ecn_create_request(req, skb, sk, dst);

	if (want_cookie) {
		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
		if (!tmp_opt.tstamp_ok)
			inet_rsk(req)->ecn_ok = 0;
	}

	tcp_rsk(req)->snt_isn = isn;
	tcp_rsk(req)->txhash = net_tx_rndhash();
	tcp_openreq_init_rwin(req, sk, dst);
	if (!want_cookie) {
		tcp_reqsk_record_syn(sk, req, skb);
		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
	}
	if (fastopen_sk) {
		af_ops->send_synack(fastopen_sk, dst, &fl, req,
				    &foc, false);
		inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
		sk->sk_data_ready(sk);
		bh_unlock_sock(fastopen_sk);
		sock_put(fastopen_sk);
	} else {
		tcp_rsk(req)->tfo_listener = false;
		if (!want_cookie)
			inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
		af_ops->send_synack(sk, dst, &fl, req,
				    &foc, !want_cookie);
		if (want_cookie)
			goto drop_and_free;
	}
	reqsk_put(req);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}
EXPORT_SYMBOL(tcp_conn_request);