/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>

int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_max_reordering __read_mostly = 300;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);

/* rfc5961 challenge ack rate limiting */
int sysctl_tcp_challenge_ack_limit = 1000;

int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_min_rtt_wlen __read_mostly = 300;

int sysctl_tcp_thin_dupack __read_mostly;

int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;

#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

#define REXMIT_NONE	0 /* no loss recovery to do */
#define REXMIT_LOST	1 /* retransmit packets marked lost */
#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */

/* Adapt the MSS value used to make delayed ack decision to the
 * real world.
 */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
	unsigned int len;

	icsk->icsk_ack.last_seg_size = 0;

	/* skb->len may jitter because of SACKs, even if peer
	 * sends good full-sized frames.
	 */
	len = skb_shinfo(skb)->gso_size ? : skb->len;
	if (len >= icsk->icsk_ack.rcv_mss) {
		icsk->icsk_ack.rcv_mss = len;
	} else {
		/* Otherwise, we make more careful check taking into
		 * account, that SACKs block is variable.
		 *
		 * "len" is invariant segment length, including TCP header.
		 */
		len += skb->data - skb_transport_header(skb);
		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
		    /* If PSH is not set, packet should be
		     * full sized, provided peer TCP is not badly broken.
		     * This observation (if it is correct 8)) allows
		     * to handle super-low mtu links fairly.
		     */
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
			/* Subtract also invariant (if peer is RFC compliant),
			 * tcp header plus fixed timestamp option length.
			 * Resulting "len" is MSS free of SACK jitter.
			 */
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
			if (len == lss) {
				icsk->icsk_ack.rcv_mss = len;
				return;
			}
		}
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
	}
}

static void tcp_incr_quickack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

	if (quickacks == 0)
		quickacks = 2;
	if (quickacks > icsk->icsk_ack.quick)
		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
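
/* Worked example (illustrative, not part of the original code): with a
 * 64 KB receive window and rcv_mss = 1460, quickacks is
 * 65536 / (2 * 1460) ~= 22, which min() then caps at TCP_MAX_QUICKACKS,
 * so only a bounded burst of segments gets ACKed without delayed-ACK
 * hold-off.
 */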

static void tcp_enter_quickack_mode(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	tcp_incr_quickack(sk);
	icsk->icsk_ack.pingpong = 0;
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */
static bool tcp_in_quickack_mode(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);

	return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
		(icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
}

static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
	if (tp->ecn_flags & TCP_ECN_OK)
		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
	if (tcp_hdr(skb)->cwr)
		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
	case INET_ECN_NOT_ECT:
		/* Funny extension: if ECT is not set on a segment,
		 * and we already seen ECT on a previous segment,
		 * it is probably a retransmit.
		 */
		if (tp->ecn_flags & TCP_ECN_SEEN)
			tcp_enter_quickack_mode((struct sock *)tp);
		break;
	case INET_ECN_CE:
		if (tcp_ca_needs_ecn((struct sock *)tp))
			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);

		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
			/* Better not delay acks, sender can have a very low cwnd */
			tcp_enter_quickack_mode((struct sock *)tp);
			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		}
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	default:
		if (tcp_ca_needs_ecn((struct sock *)tp))
			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	}
}

static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
	if (tp->ecn_flags & TCP_ECN_OK)
		__tcp_ecn_check_ce(tp, skb);
}

static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
		return true;
	return false;
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 */
static void tcp_sndbuf_expand(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int sndmem, per_mss;
	u32 nr_segs;

	/* Worst case is non GSO/TSO : each frame consumes one skb
	 * and skb->head is kmalloced using power of two area of memory
	 */
	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
		  MAX_TCP_HEADER +
		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	per_mss = roundup_pow_of_two(per_mss) +
		  SKB_DATA_ALIGN(sizeof(struct sk_buff));

	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);

	/* Fast Recovery (RFC 5681 3.2) :
	 * Cubic needs 1.7 factor, rounded to 2 to include
	 * extra cushion (application might react slowly to POLLOUT)
	 */
	sndmem = 2 * nr_segs * per_mss;

	if (sk->sk_sndbuf < sndmem)
		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
}
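
/* Back-of-the-envelope check (illustrative figures, not part of the
 * original code): with mss_cache = 1460, per_mss rounds up to roughly
 * 4 KB of truesize per segment.  For the initial cwnd of 10 segments
 * this asks for about 2 * 10 * 4 KB = 80 KB of send buffer, before
 * being clamped by sysctl_tcp_wmem[2].
 */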

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is maximal advertised window. It can be less than
 * tcp_full_space(), in this case window_clamp will be overriden by
 * sysctl_tcp_rmem[2]. Usually, application sets only SO_RCVBUF, when
 * tuning is not required.
 *
 * rcv_ssthresh is more strict window_clamp used at "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int truesize = tcp_win_from_space(skb->truesize) >> 1;
	int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}

static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Check #1 */
	if (tp->rcv_ssthresh < tp->window_clamp &&
	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
	    !tcp_under_memory_pressure(sk)) {
		int incr;

		/* Check #2. Increase window, if skb with such overhead
		 * will fit to rcvbuf in future.
		 */
		if (tcp_win_from_space(skb->truesize) <= skb->len)
			incr = 2 * tp->advmss;
		else
			incr = __tcp_grow_window(sk, skb);

		if (incr) {
			incr = max_t(int, incr, 2 * skb->len);
			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
					       tp->window_clamp);
			inet_csk(sk)->icsk_ack.quick |= 1;
		}
	}
}

/* 3. Tuning rcvbuf, when connection enters established state. */
static void tcp_fixup_rcvbuf(struct sock *sk)
{
	u32 mss = tcp_sk(sk)->advmss;
	int rcvmem;

	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
		 tcp_default_init_rwnd(mss);

	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
	 * Allow enough cushion so that sender is not limited by our window
	 */
	if (sysctl_tcp_moderate_rcvbuf)
		rcvmem <<= 2;

	if (sk->sk_rcvbuf < rcvmem)
		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
}

/* 4. Try to fixup all. It is made immediately after connection enters
 *    established state.
 */
void tcp_init_buffer_space(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		tcp_fixup_rcvbuf(sk);
	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
		tcp_sndbuf_expand(sk);

	tp->rcvq_space.space = tp->rcv_wnd;
	tp->rcvq_space.time = tcp_time_stamp;
	tp->rcvq_space.seq = tp->copied_seq;

	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
			tp->window_clamp = max(maxwin -
					       (maxwin >> sysctl_tcp_app_win),
					       4 * tp->advmss);
	}

	/* Force reservation of one segment. */
	if (sysctl_tcp_app_win &&
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
	tp->snd_cwnd_stamp = tcp_time_stamp;
}

/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_ack.quick = 0;

	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
	    !tcp_under_memory_pressure(sk) &&
	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				    sysctl_tcp_rmem[2]);
	}
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

/* Initialize RCV_MSS value.
 * RCV_MSS is an our guess about MSS used by the peer.
 * We haven't any direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate.
 * Overestimations make us ACKing less frequently than needed.
 * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd / 2);
	hint = min(hint, TCP_MSS_DEFAULT);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
EXPORT_SYMBOL(tcp_initialize_rcv_mss);

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <http://public.lanl.gov/radiant/pubs.html#DRS>
 *
 * More detail on this code can be found at
 * <http://staff.psc.edu/jheffner/>,
 * though this reference is out of date.  A new paper
 * is pending.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt;
	long m = sample;

	if (m == 0)
		m = 1;

	if (new_sample != 0) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
		 * non-timestamp case, we do not smooth things out
		 * else with timestamps disabled convergence takes too
		 * long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else {
			m <<= 3;
			if (m < new_sample)
				new_sample = m;
		}
	} else {
		/* No previous measure. */
		new_sample = m << 3;
	}

	if (tp->rcv_rtt_est.rtt != new_sample)
		tp->rcv_rtt_est.rtt = new_sample;
}
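
/* Fixed-point note (illustrative, not part of the original code): the
 * estimate is stored scaled by 8.  In the !win_dep branch,
 * new_sample += m - new_sample/8 is an EWMA with gain 1/8, e.g. a
 * stored value of 80 (rtt 10) and sample m = 18 yields 80 + (18 - 10)
 * = 88, i.e. rtt 11.  In the win_dep branch only the minimum of the
 * scaled samples is tracked.
 */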

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	if (tp->rcv_rtt_est.time == 0)
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tcp_time_stamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
					  const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (tp->rx_opt.rcv_tsecr &&
	    (TCP_SKB_CB(skb)->end_seq -
	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int time;
	int copied;

	time = tcp_time_stamp - tp->rcvq_space.time;
	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
		return;

	/* Number of bytes copied to user in last RTT */
	copied = tp->copied_seq - tp->rcvq_space.seq;
	if (copied <= tp->rcvq_space.space)
		goto new_measure;

	/* A bit of theory :
	 * copied = bytes received in previous RTT, our base window
	 * To cope with packet losses, we need a 2x factor
	 * To cope with slow start, and sender growing its cwin by 100 %
	 * every RTT, we need a 4x factor, because the ACK we are sending
	 * now is for the next RTT, not the current one :
	 * <prev RTT . ><current RTT .. ><next RTT .... >
	 */
	if (sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvwin, rcvmem, rcvbuf;

		/* minimal window to cope with packet losses, assuming
		 * steady state. Add some cushion because of small variations.
		 */
		rcvwin = (copied << 1) + 16 * tp->advmss;

		/* If rate increased by 25%,
		 *	assume slow start, rcvwin = 3 * copied
		 * If rate increased by 50%,
		 *	assume sender can use 2x growth, rcvwin = 4 * copied
		 */
		if (copied >=
		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
			if (copied >=
			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
				rcvwin <<= 1;
			else
				rcvwin += (rcvwin >> 1);
		}

		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(rcvmem) < tp->advmss)
			rcvmem += 128;

		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
		if (rcvbuf > sk->sk_rcvbuf) {
			sk->sk_rcvbuf = rcvbuf;

			/* Make the window clamp follow along.  */
			tp->window_clamp = rcvwin;
		}
	}
	tp->rcvq_space.space = copied;

new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tcp_time_stamp;
}
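
/* DRS sizing example (illustrative numbers, not part of the original
 * code): if 100 KB were copied to the application in the last RTT, the
 * base target is 2 * 100 KB plus a 16 * advmss cushion; a >= 50% rate
 * increase doubles that to roughly 4 * copied, so the receive buffer
 * stays ahead of a slow-starting sender.
 */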

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  The means that until we send the first few ACK's the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now;

	inet_csk_schedule_ack(sk);

	tcp_measure_rcv_mss(sk, skb);

	tcp_rcv_rtt_measure(tp);

	now = tcp_time_stamp;

	if (!icsk->icsk_ack.ato) {
		/* The _first_ data packet received, initialize
		 * delayed ACK engine.
		 */
		tcp_incr_quickack(sk);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
	} else {
		int m = now - icsk->icsk_ack.lrcvtime;

		if (m <= TCP_ATO_MIN / 2) {
			/* The fastest case is the first. */
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
			/* Too long gap. Apparently sender failed to
			 * restart window, so that we send ACKs quickly.
			 */
			tcp_incr_quickack(sk);
			sk_mem_reclaim(sk);
		}
	}
	icsk->icsk_ack.lrcvtime = now;

	tcp_ecn_check_ce(tp, skb);

	if (skb->len >= 128)
		tcp_grow_window(sk, skb);
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
	struct tcp_sock *tp = tcp_sk(sk);
	long m = mrtt_us;	/* RTT */
	u32 srtt = tp->srtt_us;

	/*	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 *
	 *	On a 1990 paper the rto value is changed to:
	 *	RTO = rtt + 4 * mdev
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO, when it should be decreased, increase
	 * too slowly, when it should be increased quickly, decrease too quickly
	 * etc. I guess in BSD RTO takes ONE second in many cases, because
	 * does not matter how to _calculate_ it. Seems, it was trap
	 * that VJ failed to avoid. 8)
	 */
	if (srtt != 0) {
		m -= (srtt >> 3);	/* m is now error in rtt est */
		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0) {
			m = -m;		/* m is now abs(error) */
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
			/* This is similar to one of Eifel findings.
			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
			 * for mdev in this case (alpha*beta).
			 * Like Eifel it also prevents growth of rto,
			 * but also it limits too fast rto decreases,
			 * happening in pure Eifel.
			 */
			if (m > 0)
				m >>= 3;
		} else {
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
		}
		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
		if (tp->mdev_us > tp->mdev_max_us) {
			tp->mdev_max_us = tp->mdev_us;
			if (tp->mdev_max_us > tp->rttvar_us)
				tp->rttvar_us = tp->mdev_max_us;
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
			if (tp->mdev_max_us < tp->rttvar_us)
				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max_us = tcp_rto_min_us(sk);
		}
	} else {
		/* no previous measure. */
		srtt = m << 3;		/* take the measured time to be rtt */
		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */
		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
		tp->mdev_max_us = tp->rttvar_us;
		tp->rtt_seq = tp->snd_nxt;
	}
	tp->srtt_us = max(1U, srtt);
}
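
/* Numeric sketch of the Jacobson/Karels update above (illustrative,
 * not part of the original code): srtt_us holds 8*srtt.  With stored
 * srtt = 800 us (actual 100 us) and a new sample m = 120 us, the error
 * is m - srtt/8 = 20 us, so the stored value becomes 820 us, i.e.
 * srtt' = 7/8 * 100 + 1/8 * 120 = 102.5 us.  mdev is kept scaled by 4
 * and updated with gain 1/4 the same way.
 */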

/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
 * Note: TCP stack does not yet implement pacing.
 * FQ packet scheduler can be used to implement cheap but effective
 * TCP pacing, to smooth the burst on large writes when packets
 * in flight is significantly lower than cwnd (or rwin)
 */
int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;

static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u64 rate;

	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	/* current rate is (cwnd * mss) / srtt
	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
	 * In Congestion Avoidance phase, set it to 120 % the current rate.
	 *
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
	 *	 end of slow start and should slow down.
	 */
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
		rate *= sysctl_tcp_pacing_ss_ratio;
	else
		rate *= sysctl_tcp_pacing_ca_ratio;

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (likely(tp->srtt_us))
		do_div(rate, tp->srtt_us);

	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
	 * without any lock. We want to make sure compiler wont store
	 * intermediate values in this location.
	 */
	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
						sk->sk_max_pacing_rate);
}
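
/* Example (illustrative, not part of the original code): cwnd = 10,
 * mss_cache = 1460 and srtt = 100 ms deliver 14600 bytes per RTT,
 * i.e. roughly 146 KB/s.  In congestion avoidance the pacing rate is
 * then set to 120% of that (about 175 KB/s); in early slow start to
 * 200%.
 */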

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static void tcp_set_rto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* Old crap is replaced with new one. 8)
	 *
	 * More seriously:
	 * 1. If rtt variance happened to be less 50msec, it is hallucination.
	 *    It cannot be less due to utterly erratic ACK generation made
	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
	 *    to do with delayed acks, because at cwnd>2 true delack timeout
	 *    is invisible. Actually, Linux-2.4 also generates erratic
	 *    ACKs in some circumstances.
	 */
	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);

	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
	 *    with correct one. It is exactly, which we pretend to do.
	 */

	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
	 * guarantees that rto is higher.
	 */
	tcp_bound_rto(sk);
}

__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

	if (!cwnd)
		cwnd = TCP_INIT_CWND;
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}

/*
 * Packet counting of FACK is based on in-order assumptions, therefore TCP
 * disables it when reordering is detected
 */
void tcp_disable_fack(struct tcp_sock *tp)
{
	/* RFC3517 uses different metric in lost marker => reset on change */
	if (tcp_is_fack(tp))
		tp->lost_skb_hint = NULL;
	tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
}

/* Take a notice that peer is sending D-SACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
}

static void tcp_update_reordering(struct sock *sk, const int metric,
				  const int ts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (metric > tp->reordering) {
		int mib_idx;

		tp->reordering = min(sysctl_tcp_max_reordering, metric);

		/* This exciting event is worth to be remembered. 8) */
		if (ts)
			mib_idx = LINUX_MIB_TCPTSREORDER;
		else if (tcp_is_reno(tp))
			mib_idx = LINUX_MIB_TCPRENOREORDER;
		else if (tcp_is_fack(tp))
			mib_idx = LINUX_MIB_TCPFACKREORDER;
		else
			mib_idx = LINUX_MIB_TCPSACKREORDER;

		NET_INC_STATS(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
		pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
			 tp->reordering,
			 tp->fackets_out,
			 tp->sacked_out,
			 tp->undo_marker ? tp->undo_retrans : 0);
#endif
		tcp_disable_fack(tp);
	}

	if (metric > 0)
		tcp_disable_early_retrans(tp);
	tp->rack.reord = 1;
}

/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (!tp->retransmit_skb_hint ||
	    before(TCP_SKB_CB(skb)->seq,
		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
		tp->retransmit_skb_hint = skb;

	if (!tp->lost_out ||
	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}

static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tcp_verify_retransmit_hint(tp, skb);

		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
	tcp_verify_retransmit_hint(tp, skb);

	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight	Description
 * 0	1		- orig segment is in flight.
 * S	0		- nothing flies, orig reached receiver.
 * L	0		- nothing flies, orig lost by net.
 * R	2		- both orig and retransmit are in flight.
 * L|R	1		- orig is lost, retransmit is in flight.
 * S|R	1		- orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and code short-circuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form finite state machine, controlled by the following
 * events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 * 3. Loss detection event of two flavors:
 *	A. Scoreboard estimator decided the packet is lost.
 *	B. SACK arrives sacking SND.NXT at the moment, when the
 *	   segment was retransmitted.
 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 *
 * Reordering detection: the reordering metric is the maximal distance a
 * packet can be displaced in the stream.  A SACK that fills an old hole
 * whose segment was never retransmitted indicates reordering; D-SACK
 * extends this heuristic to retransmitted segments.  Neither heuristic
 * is used in Loss state, where retransmits cannot be accounted reliably.
 *
 * SACK block validation checks that a received SACK block fits the
 * expected sequence limits, i.e. it lies between SND.UNA and SND.NXT,
 * and guards against start_seq wrap-around.  With D-SACK the lower
 * bound is extended down to undo_marker.
 */
static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				   u32 start_seq, u32 end_seq)
{
	/* Too far in future, or reversed (interpretation is ambiguous) */
	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
		return false;

	/* Nasty start_seq wrap-around check (see comments above) */
	if (!before(start_seq, tp->snd_nxt))
		return false;

	/* In outstanding window? ...-- [SND.UNA, SND.NXT) --...
	 * start_seq == snd_una is non-sensical (see comments above)
	 */
	if (after(start_seq, tp->snd_una))
		return true;

	if (!is_dsack || !tp->undo_marker)
		return false;

	/* ...retransmitted data is DSACKed */
	if (after(end_seq, tp->snd_una))
		return false;

	if (!before(start_seq, tp->undo_marker))
		return true;

	/* Too old */
	if (!after(end_seq, tp->undo_marker))
		return false;

	/* Undo_marker boundary crossing (overestimates a lot). Known already:
	 *   start_seq < undo_marker and end_seq >= undo_marker.
	 */
	return !before(start_seq, end_seq - tp->max_window);
}
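
/* Example (illustrative, not part of the original code): with
 * snd_una = 1000 and snd_nxt = 5000, a SACK block [2000,3000) is
 * accepted, [6000,7000) is rejected as beyond SND.NXT, and a D-SACK
 * [500,900) is only accepted when undo state still covers that range.
 */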

static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
			    struct tcp_sack_block_wire *sp, int num_sacks,
			    u32 prior_snd_una)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
	bool dup_sack = false;

	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
		dup_sack = true;
		tcp_dsack_seen(tp);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
	} else if (num_sacks > 1) {
		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);

		if (!after(end_seq_0, end_seq_1) &&
		    !before(start_seq_0, start_seq_1)) {
			dup_sack = true;
			tcp_dsack_seen(tp);
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPDSACKOFORECV);
		}
	}

	/* D-SACK for already forgotten data... Do dumb counting. */
	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
	    !after(end_seq_0, prior_snd_una) &&
	    after(end_seq_0, tp->undo_marker))
		tp->undo_retrans--;

	return dup_sack;
}

struct tcp_sacktag_state {
	int	reord;
	int	fack_count;
	/* Timestamps for earliest and latest never-retransmitted segment
	 * that was SACKed, used for RTT measurement via SACK.
	 */
	struct skb_mstamp first_sackt;
	struct skb_mstamp last_sackt;
	int	flag;
};

/* Check if skb is fully within the SACK block. In presence of GSO skbs,
 * the incoming SACK may not exactly match but we can find smaller MSS
 * aligned portion of it that matches. Therefore we might need to fragment
 * which may fail and creates some hassle (caller must handle error case
 * returns).
 *
 * FIXME: this could be merged to shift decision code
 */
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
				 u32 start_seq, u32 end_seq)
{
	int err;
	bool in_sack;
	unsigned int pkt_len;
	unsigned int mss;

	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);

	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
	    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
		mss = tcp_skb_mss(skb);
		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);

		if (!in_sack) {
			pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
			if (pkt_len < mss)
				pkt_len = mss;
		} else {
			pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
			if (pkt_len < mss)
				return -EINVAL;
		}

		/* Round if necessary so that SACKs cover only full MSSes
		 * and/or the remaining small portion (if present)
		 */
		if (pkt_len > mss) {
			unsigned int new_len = (pkt_len / mss) * mss;
			if (!in_sack && new_len < pkt_len) {
				new_len += mss;
				if (new_len >= skb->len)
					return 0;
			}
			pkt_len = new_len;
		}
		err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
		if (err < 0)
			return err;
	}

	return in_sack;
}
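
/* Fragmenting example (illustrative, not part of the original code):
 * a GSO skb covering [1000,6840) with mss 1460 that is hit by a SACK
 * block starting at seq 3920 is split at the MSS-rounded offset
 * pkt_len = 2920 (2 * 1460), producing head [1000,3920) and tail
 * [3920,6840) so that only the tail gets tagged.
 */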

/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
static u8 tcp_sacktag_one(struct sock *sk,
			  struct tcp_sacktag_state *state, u8 sacked,
			  u32 start_seq, u32 end_seq,
			  int dup_sack, int pcount,
			  const struct skb_mstamp *xmit_time)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int fack_count = state->fack_count;

	/* Account D-SACK for retransmitted packet. */
	if (dup_sack && (sacked & TCPCB_RETRANS)) {
		if (tp->undo_marker && tp->undo_retrans > 0 &&
		    after(end_seq, tp->undo_marker))
			tp->undo_retrans--;
		if (sacked & TCPCB_SACKED_ACKED)
			state->reord = min(fack_count, state->reord);
	}

	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
	if (!after(end_seq, tp->snd_una))
		return sacked;

	if (!(sacked & TCPCB_SACKED_ACKED)) {
		tcp_rack_advance(tp, xmit_time, sacked);

		if (sacked & TCPCB_SACKED_RETRANS) {
			/* If the segment is not tagged as lost,
			 * we do not clear RETRANS, believing
			 * that retransmission is still in flight.
			 */
			if (sacked & TCPCB_LOST) {
				sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
				tp->lost_out -= pcount;
				tp->retrans_out -= pcount;
			}
		} else {
			if (!(sacked & TCPCB_RETRANS)) {
				/* New sack for not retransmitted frame,
				 * which was in hole. It is reordering.
				 */
				if (before(start_seq,
					   tcp_highest_sack_seq(tp)))
					state->reord = min(fack_count,
							   state->reord);
				if (!after(end_seq, tp->high_seq))
					state->flag |= FLAG_ORIG_SACK_ACKED;
				if (state->first_sackt.v64 == 0)
					state->first_sackt = *xmit_time;
				state->last_sackt = *xmit_time;
			}

			if (sacked & TCPCB_LOST) {
				sacked &= ~TCPCB_LOST;
				tp->lost_out -= pcount;
			}
		}

		sacked |= TCPCB_SACKED_ACKED;
		state->flag |= FLAG_DATA_SACKED;
		tp->sacked_out += pcount;
		tp->delivered += pcount;	/* Out-of-order packets delivered */

		fack_count += pcount;

		/* Lost marker hint past SACKed? Tweak RFC3517 cnt hint */
		if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
		    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
			tp->lost_cnt_hint += pcount;

		if (fack_count > tp->fackets_out)
			tp->fackets_out = fack_count;
	}

	/* D-SACK. We can detect redundant retransmission in S|R and plain R
	 * frames and clear it. undo_retrans is decreased above, L|R frames
	 * are accounted above as well.
	 */
	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
		sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= pcount;
	}

	return sacked;
}

/* Shift newly-SACKed bytes from this skb to the immediately previous
 * already-SACKed skb. Mark the newly-SACKed bytes as such.
 */
static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
			    struct tcp_sacktag_state *state,
			    unsigned int pcount, int shifted, int mss,
			    bool dup_sack)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
	u32 start_seq = TCP_SKB_CB(skb)->seq;	/* start of newly-SACKed */
	u32 end_seq = start_seq + shifted;	/* end of newly-SACKed */

	BUG_ON(!pcount);

	/* Adjust counters and hints for the newly sacked sequence
	 * range but discard the return value since prev is already
	 * marked. We must tag the range first because the seq
	 * advancement below implicitly advances
	 * tcp_highest_sack_seq() when skb is highest_sack.
	 */
	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
			start_seq, end_seq, dup_sack, pcount,
			&skb->skb_mstamp);

	if (skb == tp->lost_skb_hint)
		tp->lost_cnt_hint += pcount;

	TCP_SKB_CB(prev)->end_seq += shifted;
	TCP_SKB_CB(skb)->seq += shifted;

	tcp_skb_pcount_add(prev, pcount);
	BUG_ON(tcp_skb_pcount(skb) < pcount);
	tcp_skb_pcount_add(skb, -pcount);

	/* When we're adding to gso_segs == 1, gso_size will be zero,
	 * in theory this shouldn't be necessary but as long as DSACK
	 * code can come after this skb later on it's better to keep
	 * setting gso_size to something.
	 */
	if (!TCP_SKB_CB(prev)->tcp_gso_size)
		TCP_SKB_CB(prev)->tcp_gso_size = mss;

	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
	if (tcp_skb_pcount(skb) <= 1)
		TCP_SKB_CB(skb)->tcp_gso_size = 0;

	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);

	if (skb->len > 0) {
		BUG_ON(!tcp_skb_pcount(skb));
		NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
		return false;
	}

	/* Whole SKB was eaten :-) */

	if (skb == tp->retransmit_skb_hint)
		tp->retransmit_skb_hint = prev;
	if (skb == tp->lost_skb_hint) {
		tp->lost_skb_hint = prev;
		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
	}

	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
		TCP_SKB_CB(prev)->end_seq++;

	if (skb == tcp_highest_sack(sk))
		tcp_advance_highest_sack(sk, skb);

	tcp_skb_collapse_tstamp(prev, skb);
	tcp_unlink_write_queue(skb, sk);
	sk_wmem_free_skb(sk, skb);

	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);

	return true;
}

/* I wish gso_size would have a bit more sane initialization than
 * something-or-zero which complicates things
 */
static int tcp_skb_seglen(const struct sk_buff *skb)
{
	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
}

/* Shifting pages past head area doesn't work */
static int skb_can_shift(const struct sk_buff *skb)
{
	return !skb_headlen(skb) && skb_is_nonlinear(skb);
}

/* Try collapsing SACK blocks spanning across multiple skbs to a single
 * skb.
 */
static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
					  struct tcp_sacktag_state *state,
					  u32 start_seq, u32 end_seq,
					  bool dup_sack)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *prev;
	int mss;
	int pcount = 0;
	int len;
	int in_sack;

	if (!sk_can_gso(sk))
		goto fallback;

	/* Normally R but no L won't be valid in this state (see above) */
	if (!dup_sack &&
	    (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
		goto fallback;
	if (!skb_can_shift(skb))
		goto fallback;
	/* This frame is about to be dropped (was ACKed). */
	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
		goto fallback;

	/* Can only happen with delayed DSACK + discard craziness */
	if (unlikely(skb == tcp_write_queue_head(sk)))
		goto fallback;
	prev = tcp_write_queue_prev(sk, skb);

	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
		goto fallback;

	if (!tcp_skb_can_collapse_to(prev))
		goto fallback;

	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);

	if (in_sack) {
		len = skb->len;
		pcount = tcp_skb_pcount(skb);
		mss = tcp_skb_seglen(skb);

		/* TODO: Fix DSACKs to not fragment already SACKed and we can
		 * drop this restriction as unnecessary
		 */
		if (mss != tcp_skb_seglen(prev))
			goto fallback;
	} else {
		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
			goto noop;
		/* CHECKME: This is non-MSS split case only?, this will
		 * cause skipped skbs due to advancing loop btw, original
		 * has that feature. I cannot believe it is missing.
		 */
		if (tcp_skb_pcount(skb) <= 1)
			goto noop;

		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
		if (!in_sack) {
			/* TODO: head merge to next could be attempted here
			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
			 * though it might not be worth of the additional hassle
			 *
			 * ...we can probably just fallback to what was done
			 * previously. We could try merging non-SACKed ones
			 * as well but it probably isn't going to buy off
			 * because later SACKs might again split them, and
			 * it would make skb timestamp tracking considerably
			 * harder problem.
			 */
			goto fallback;
		}

		len = end_seq - TCP_SKB_CB(skb)->seq;
		BUG_ON(len < 0);
		BUG_ON(len > skb->len);

		/* MSS boundaries should be honoured or else pcount will
		 * severely break even though it makes things bit trickier.
		 * Optimize common case to avoid most of the divides
		 */
		mss = tcp_skb_mss(skb);

		/* TODO: Fix DSACKs to not fragment already SACKed and we can
		 * drop this restriction as unnecessary
		 */
		if (mss != tcp_skb_seglen(prev))
			goto fallback;

		if (len == mss) {
			pcount = 1;
		} else if (len < mss) {
			goto noop;
		} else {
			pcount = len / mss;
			len = pcount * mss;
		}
	}

	/* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
		goto fallback;

	if (!skb_shift(prev, skb, len))
		goto fallback;
	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
		goto out;

	/* Hole filled allows collapsing with the next as well, this is very
	 * useful when hole on every nth skb pattern happens
	 */
	if (prev == tcp_write_queue_tail(sk))
		goto out;
	skb = tcp_write_queue_next(sk, prev);

	if (!skb_can_shift(skb) ||
	    (skb == tcp_send_head(sk)) ||
	    ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
	    (mss != tcp_skb_seglen(skb)))
		goto out;

	len = skb->len;
	if (skb_shift(prev, skb, len)) {
		pcount += tcp_skb_pcount(skb);
		tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
	}

out:
	state->fack_count += pcount;
	return prev;

noop:
	return skb;

fallback:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
	return NULL;
}

static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
					struct tcp_sack_block *next_dup,
					struct tcp_sacktag_state *state,
					u32 start_seq, u32 end_seq,
					bool dup_sack_in)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *tmp;

	tcp_for_write_queue_from(skb, sk) {
		int in_sack = 0;
		bool dup_sack = dup_sack_in;

		if (skb == tcp_send_head(sk))
			break;

		/* queue is in-order => we can short-circuit the walk early */
		if (!before(TCP_SKB_CB(skb)->seq, end_seq))
			break;

		if (next_dup &&
		    before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
			in_sack = tcp_match_skb_to_sack(sk, skb,
							next_dup->start_seq,
							next_dup->end_seq);
			if (in_sack > 0)
				dup_sack = true;
		}

		/* skb reference here is a bit tricky to get right, since
		 * shifting can eat and free both this skb and the next,
		 * so not even _safe variant of the loop is enough.
		 */
		if (in_sack <= 0) {
			tmp = tcp_shift_skb_data(sk, skb, state,
						 start_seq, end_seq, dup_sack);
			if (tmp) {
				if (tmp != skb) {
					skb = tmp;
					continue;
				}

				in_sack = 0;
			} else {
				in_sack = tcp_match_skb_to_sack(sk, skb,
								start_seq,
								end_seq);
			}
		}

		if (unlikely(in_sack < 0))
			break;

		if (in_sack) {
			TCP_SKB_CB(skb)->sacked =
				tcp_sacktag_one(sk,
						state,
						TCP_SKB_CB(skb)->sacked,
						TCP_SKB_CB(skb)->seq,
						TCP_SKB_CB(skb)->end_seq,
						dup_sack,
						tcp_skb_pcount(skb),
						&skb->skb_mstamp);

			if (!before(TCP_SKB_CB(skb)->seq,
				    tcp_highest_sack_seq(tp)))
				tcp_advance_highest_sack(sk, skb);
		}

		state->fack_count += tcp_skb_pcount(skb);
	}
	return skb;
}

/* Avoid all extra work that is being done by sacktag while walking in
 * a normal way
 */
static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
					struct tcp_sacktag_state *state,
					u32 skip_to_seq)
{
	tcp_for_write_queue_from(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;

		if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
			break;

		state->fack_count += tcp_skb_pcount(skb);
	}
	return skb;
}

static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
						struct sock *sk,
						struct tcp_sack_block *next_dup,
						struct tcp_sacktag_state *state,
						u32 skip_to_seq)
{
	if (!next_dup)
		return skb;

	if (before(next_dup->start_seq, skip_to_seq)) {
		skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
		skb = tcp_sacktag_walk(skb, sk, NULL, state,
				       next_dup->start_seq, next_dup->end_seq,
				       1);
	}

	return skb;
}

static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
{
	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
}

static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
			u32 prior_snd_una, struct tcp_sacktag_state *state)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const unsigned char *ptr = (skb_transport_header(ack_skb) +
				    TCP_SKB_CB(ack_skb)->sacked);
	struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
	struct tcp_sack_block sp[TCP_NUM_SACKS];
	struct tcp_sack_block *cache;
	struct sk_buff *skb;
	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
	int used_sacks;
	bool found_dup_sack = false;
	int i, j;
	int first_sack_index;

	state->flag = 0;
	state->reord = tp->packets_out;

	if (!tp->sacked_out) {
		if (WARN_ON(tp->fackets_out))
			tp->fackets_out = 0;
		tcp_highest_sack_reset(sk);
	}

	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
					 num_sacks, prior_snd_una);
	if (found_dup_sack)
		state->flag |= FLAG_DSACKING_ACK;

	/* Eliminate too old ACKs, but take into
	 * account more or less fresh ones, they can
	 * contain valid SACK info.
	 */
	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
		return 0;

	if (!tp->packets_out)
		goto out;

	used_sacks = 0;
	first_sack_index = 0;
	for (i = 0; i < num_sacks; i++) {
		bool dup_sack = !i && found_dup_sack;

		sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
		sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);

		if (!tcp_is_sackblock_valid(tp, dup_sack,
					    sp[used_sacks].start_seq,
					    sp[used_sacks].end_seq)) {
			int mib_idx;

			if (dup_sack) {
				if (!tp->undo_marker)
					mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
				else
					mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
			} else {
				/* Don't count olds caused by ACK reordering */
				if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
				    !after(sp[used_sacks].end_seq, tp->snd_una))
					continue;
				mib_idx = LINUX_MIB_TCPSACKDISCARD;
			}

			NET_INC_STATS(sock_net(sk), mib_idx);
			if (i == 0)
				first_sack_index = -1;
			continue;
		}

		/* Ignore very old stuff early */
		if (!after(sp[used_sacks].end_seq, prior_snd_una))
			continue;

		used_sacks++;
	}

	/* order SACK blocks to allow in order walk of the retrans queue */
	for (i = used_sacks - 1; i > 0; i--) {
		for (j = 0; j < i; j++) {
			if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
				swap(sp[j], sp[j + 1]);

				/* Track where the first SACK block goes to */
				if (j == first_sack_index)
					first_sack_index = j + 1;
			}
		}
	}

	skb = tcp_write_queue_head(sk);
	state->fack_count = 0;
	i = 0;

	if (!tp->sacked_out) {
		/* It's already past, so skip checking against it */
		cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
	} else {
		cache = tp->recv_sack_cache;
		/* Skip empty blocks in at head of the cache */
		while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
		       !cache->end_seq)
			cache++;
	}

	while (i < used_sacks) {
		u32 start_seq = sp[i].start_seq;
		u32 end_seq = sp[i].end_seq;
		bool dup_sack = (found_dup_sack && (i == first_sack_index));
		struct tcp_sack_block *next_dup = NULL;

		if (found_dup_sack && ((i + 1) == first_sack_index))
			next_dup = &sp[i + 1];

		/* Skip too early cached blocks */
		while (tcp_sack_cache_ok(tp, cache) &&
		       !before(start_seq, cache->end_seq))
			cache++;

		/* Can skip some work by looking recv_sack_cache? */
		if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
		    after(end_seq, cache->start_seq)) {

			/* Head todo? */
			if (before(start_seq, cache->start_seq)) {
				skb = tcp_sacktag_skip(skb, sk, state,
						       start_seq);
				skb = tcp_sacktag_walk(skb, sk, next_dup,
						       state,
						       start_seq,
						       cache->start_seq,
						       dup_sack);
			}

			/* Rest of the block already fully processed? */
			if (!after(end_seq, cache->end_seq))
				goto advance_sp;

			skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
						       state,
						       cache->end_seq);

			/* ...tail remains todo... */
			if (tcp_highest_sack_seq(tp) == cache->end_seq) {
				/* ...but better entrypoint exists! */
				skb = tcp_highest_sack(sk);
				if (!skb)
					break;
				state->fack_count = tp->fackets_out;
				cache++;
				goto walk;
			}

			skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
			/* Check overlap against next cached too (past this one already) */
			cache++;
			continue;
		}

		if (!before(start_seq, tcp_highest_sack_seq(tp))) {
			skb = tcp_highest_sack(sk);
			if (!skb)
				break;
			state->fack_count = tp->fackets_out;
		}
		skb = tcp_sacktag_skip(skb, sk, state, start_seq);

walk:
		skb = tcp_sacktag_walk(skb, sk, next_dup, state,
				       start_seq, end_seq, dup_sack);

advance_sp:
		i++;
	}

	/* Clear the head of the cache sack blocks so we can skip it next time */
	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
		tp->recv_sack_cache[i].start_seq = 0;
		tp->recv_sack_cache[i].end_seq = 0;
	}
	for (j = 0; j < used_sacks; j++)
		tp->recv_sack_cache[i++] = sp[j];

	if ((state->reord < tp->fackets_out) &&
	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
		tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);

	tcp_verify_left_out(tp);
out:

#if FASTRETRANS_DEBUG > 0
	WARN_ON((int)tp->sacked_out < 0);
	WARN_ON((int)tp->lost_out < 0);
	WARN_ON((int)tp->retrans_out < 0);
	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
	return state->flag;
}

/* Limits sacked_out so that sum with lost_out isn't ever larger than
 * packets_out. Returns false if sacked_out adjustement wasn't necessary.
 */
static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
	u32 holes;

	holes = max(tp->lost_out, 1U);
	holes = min(holes, tp->packets_out);

	if ((tp->sacked_out + holes) > tp->packets_out) {
		tp->sacked_out = tp->packets_out - holes;
		return true;
	}
	return false;
}
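
/* Example (illustrative, not part of the original code): with
 * packets_out = 10, lost_out = 3 and sacked_out = 9, holes = 3 and
 * 9 + 3 > 10, so sacked_out is clamped to 7 to keep the implied
 * packets-in-flight count non-negative.
 */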

/* If we receive more dupacks than we expected counting segments
 * in assumption of absent reordering, interpret this as reordering.
 * The only another reason could be bug in receiver TCP.
 */
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (tcp_limit_reno_sacked(tp))
		tcp_update_reordering(sk, tp->packets_out + addend, 0);
}

/* Emulate SACKs for SACKless connection: account for a new dupack. */

static void tcp_add_reno_sack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 prior_sacked = tp->sacked_out;

	tp->sacked_out++;
	tcp_check_reno_reordering(sk, 0);
	if (tp->sacked_out > prior_sacked)
		tp->delivered++; /* Some out-of-order packet is delivered */
	tcp_verify_left_out(tp);
}

/* Account for ACK, ACKing some data in Reno Recovery phase. */

static void tcp_remove_reno_sacks(struct sock *sk, int acked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (acked > 0) {
		/* One ACK acked hole. The rest eat duplicate ACKs. */
		tp->delivered += max_t(int, acked - tp->sacked_out, 1);
		if (acked - 1 >= tp->sacked_out)
			tp->sacked_out = 0;
		else
			tp->sacked_out -= acked - 1;
	}
	tcp_check_reno_reordering(sk, acked);
	tcp_verify_left_out(tp);
}

static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
{
	tp->sacked_out = 0;
}

void tcp_clear_retrans(struct tcp_sock *tp)
{
	tp->retrans_out = 0;
	tp->lost_out = 0;
	tp->undo_marker = 0;
	tp->undo_retrans = -1;
	tp->fackets_out = 0;
	tp->sacked_out = 0;
}

static inline void tcp_init_undo(struct tcp_sock *tp)
{
	tp->undo_marker = tp->snd_una;
	/* Retransmission still in flight may cause DSACKs later. */
	tp->undo_retrans = tp->retrans_out ? : -1;
}

/* Enter Loss state. If we detect SACK reneging, forget all SACK information
 * and reset tags completely, otherwise preserve SACKs. If receiver
 * dropped its ofo queue, we will know this due to reneging detection.
 */
void tcp_enter_loss(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct sk_buff *skb;
	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
	bool is_reneg;			/* is receiver reneging on SACKs? */

	/* Reduce ssthresh if it has not yet been made inside this window. */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
	    !after(tp->high_seq, tp->snd_una) ||
	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
		tp->prior_ssthresh = tcp_current_ssthresh(sk);
		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
		tcp_ca_event(sk, CA_EVENT_LOSS);
		tcp_init_undo(tp);
	}
	tp->snd_cwnd	   = 1;
	tp->snd_cwnd_cnt   = 0;
	tp->snd_cwnd_stamp = tcp_time_stamp;

	tp->retrans_out = 0;
	tp->lost_out = 0;

	if (tcp_is_reno(tp))
		tcp_reset_reno_sack(tp);

	skb = tcp_write_queue_head(sk);
	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
	if (is_reneg) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
		tp->sacked_out = 0;
		tp->fackets_out = 0;
	}
	tcp_clear_all_retrans_hints(tp);

	tcp_for_write_queue(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;

		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
			tp->lost_out += tcp_skb_pcount(skb);
			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
		}
	}
	tcp_verify_left_out(tp);

	/* Timeout in disordered state after receiving substantial DUPACKs
	 * suggests that the degree of reordering is over-estimated.
	 */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
	    tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
		tp->reordering = min_t(unsigned int, tp->reordering,
				       net->ipv4.sysctl_tcp_reordering);
	tcp_set_ca_state(sk, TCP_CA_Loss);
	tp->high_seq = tp->snd_nxt;
	tcp_ecn_queue_cwr(tp);

	/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA with loss recovery
	 * underway, except recurring timeout(s) on the same SND.UNA
	 * (sec 3.2). Disable F-RTO on path MTU probing.
	 */
	tp->frto = sysctl_tcp_frto &&
		   (new_recovery || icsk->icsk_retransmits) &&
		   !inet_csk(sk)->icsk_mtup.probe_size;
}

/* If ACK arrived pointing to a remembered SACK, it means that our
 * remembered SACKs do not reflect real state of receiver i.e.
 * receiver _host_ is heavily congested (or buggy).
 *
 * To avoid big spurious retransmission bursts due to transient SACK
 * scoreboard oddities that look like reneging, we give the receiver a
 * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
 * restore sanity to the SACK scoreboard. If the apparent reneging
 * persists until this RTO then we'll clear the SACK scoreboard.
 */
static bool tcp_check_sack_reneging(struct sock *sk, int flag)
{
	if (flag & FLAG_SACK_RENEGING) {
		struct tcp_sock *tp = tcp_sk(sk);
		unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
					  msecs_to_jiffies(10));

		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  delay, TCP_RTO_MAX);
		return true;
	}
	return false;
}

static inline int tcp_fackets_out(const struct tcp_sock *tp)
{
	return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
}

/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
 * counter when SACK is enabled (without SACK, sacked_out is used for
 * that purpose).
 *
 * Instead, with FACK TCP uses fackets_out that includes both SACKed
 * segments up to the highest received SACK block so far and holes in
 * between them.
 *
 * With reordering, holes may still be in flight, so RFC3517 recovery
 * uses pure sacked_out (total number of SACKed segments) even though
 * it violates the RFC that uses duplicate ACKs, often these are equal
 * but when e.g. out-of-window ACKs or packet duplication occurs,
 * they differ. Since neither occurs due to loss, TCP should really
 * ignore them.
 */
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}

static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long delay;

	/* Delay early retransmit and entering fast recovery for
	 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
	 * available, or RTO is scheduled to fire first.
	 */
	if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
	    (flag & FLAG_ECE) || !tp->srtt_us)
		return false;

	delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
		    msecs_to_jiffies(2));

	if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
		return false;

	inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
				  TCP_RTO_MAX);
	return true;
}
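
/* Timing note (illustrative, not part of the original code): srtt_us
 * is stored scaled by 8, so srtt_us >> 5 is srtt/4 and srtt_us >> 4 is
 * srtt/2.  With srtt = 40 ms the early-retransmit delay above is
 * max(10 ms, 2 ms) = 10 ms.
 */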

/* Linux NewReno/SACK/FACK/ECN state machine.
 * --------------------------------------
 *
 * "Open"	Normal state, no dubious events, fast path.
 * "Disorder"	In all the respects it is "Open",
 *		but requires a bit more attention. It is entered when
 *		we see some SACKs or dupacks. It is split of "Open"
 *		mainly to move some processing from fast path to slow one.
 * "CWR"	CWND was reduced due to some Congestion Notification event.
 *		It can be ECN, ICMP source quench, local device congestion.
 * "Recovery"	CWND was reduced, we are fast-retransmitting.
 * "Loss"	CWND was reduced due to RTO timeout or SACK reneging.
 *
 * tcp_fastretrans_alert() is entered:
 * - each incoming ACK, if state is not "Open"
 * - when arrived ACK is unusual, namely:
 *	* SACK
 *	* Duplicate ACK.
 *	* ECN ECE.
 *
 * Counting packets in flight is pretty simple.
 *
 *	in_flight = packets_out - left_out + retrans_out
 *
 *	packets_out is SND.NXT-SND.UNA counted in packets.
 *	retrans_out is number of retransmitted segments.
 *	left_out is number of segments left network, but not ACKed yet.
 *
 *		left_out = sacked_out + lost_out
 *
 *	sacked_out: Packets, which arrived to receiver out of order
 *		   and hence not ACKed. With SACKs this number is simply
 *		   amount of SACKed data. Even without SACKs it is easy
 *		   to give pretty reliable estimate of this number,
 *		   counting duplicate ACKs.
 *
 *	lost_out: Packets lost by network. TCP has no explicit
 *		   "loss notification" feedback from network (for now).
 *		   It means that this number can be only _guessed_.
 *		   Actually, it is the heuristics to predict lossage that
 *		   distinguishes different algorithms.
 *
 * If the receiver supports SACK (RFC6675/3517): a packet is considered
 * lost if the number of higher-sequence packets SACKed is greater than
 * or equal to the dupACK threshold (reordering); see
 * tcp_mark_head_lost() and tcp_update_scoreboard().
 *
 * If the receiver does not support SACK (NewReno, RFC6582): in Recovery
 * we assume that one segment is lost (classic Reno); while we are in
 * Recovery and a partial ACK arrives, we assume that one more packet
 * is lost (NewReno).
 *
 * The really tricky (and requiring careful tuning) part of the
 * algorithm is hidden in tcp_time_to_recover() and
 * tcp_xmit_retransmit_queue().
 */
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 packets_out;
	int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;

	/* Trick#1: The loss is proven. */
	if (tp->lost_out)
		return true;

	/* Not-A-Trick#2 : Classic rule... */
	if (tcp_dupack_heuristics(tp) > tp->reordering)
		return true;

	/* Trick#4: It is still not OK... But will it be useful to delay
	 * recovery more?
	 */
	packets_out = tp->packets_out;
	if (packets_out <= tp->reordering &&
	    tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
	    !tcp_may_send_now(sk)) {
		/* We have nothing to send. This connection is limited
		 * either by receiver window or by application.
		 */
		return true;
	}

	/* If a thin stream is detected, retransmit after first
	 * received dupack. Employ only if SACK is supported in order
	 * to avoid possible corner-case series of spurious retransmissions
	 * Use only if there are no unsent data.
	 */
	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
	    tcp_is_sack(tp) && !tcp_send_head(sk))
		return true;

	/* Trick#6: TCP early retransmit, per RFC5827.  To avoid spurious
	 * retransmissions due to small network reorderings, we implement
	 * Mitigation A.3 in the RFC and delay the retransmission for a short
	 * interval if appropriate.
	 */
	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
	    (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
	    !tcp_may_send_now(sk))
		return !tcp_pause_early_retransmit(sk, flag);

	return false;
}

/* Detect loss in event "A" above by marking head of queue up as lost.
 * For FACK or non-SACK(Reno) senders, the first "packets" number of segments
 * are considered lost. For RFC3517 SACK, a segment is considered lost if it
 * has at least tp->reordering SACKed segments above it; "packets" refers to
 * the max sacked_out forward range in that case.
 */
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int cnt, oldcnt, lost;
	unsigned int mss;
	/* Use SACK to deduce losses of new sequences sent during recovery */
	const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;

	WARN_ON(packets > tp->packets_out);
	if (tp->lost_skb_hint) {
		skb = tp->lost_skb_hint;
		cnt = tp->lost_cnt_hint;
		/* Head already handled? */
		if (mark_head && skb != tcp_write_queue_head(sk))
			return;
	} else {
		skb = tcp_write_queue_head(sk);
		cnt = 0;
	}

	tcp_for_write_queue_from(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		/* TODO: do this better */
		/* this is not the most efficient way to do this... */
		tp->lost_skb_hint = skb;
		tp->lost_cnt_hint = cnt;

		if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
			break;

		oldcnt = cnt;
		if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
		    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
			cnt += tcp_skb_pcount(skb);

		if (cnt > packets) {
			if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
			    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
			    (oldcnt >= packets))
				break;

			mss = tcp_skb_mss(skb);
			/* If needed, chop off the prefix to mark as lost. */
			lost = (packets - oldcnt) * mss;
			if (lost < skb->len &&
			    tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
				break;
			cnt = packets;
		}

		tcp_skb_mark_lost(tp, skb);

		if (mark_head)
			break;
	}
	tcp_verify_left_out(tp);
}

/* Account newly detected lost packet(s) */
static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_is_reno(tp)) {
		tcp_mark_head_lost(sk, 1, 1);
	} else if (tcp_is_fack(tp)) {
		int lost = tp->fackets_out - tp->reordering;
		if (lost <= 0)
			lost = 1;
		tcp_mark_head_lost(sk, lost, 0);
	} else {
		int sacked_upto = tp->sacked_out - tp->reordering;
		if (sacked_upto >= 0)
			tcp_mark_head_lost(sk, sacked_upto, 0);
		else if (fast_rexmit)
			tcp_mark_head_lost(sk, 1, 1);
	}
}

static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
	       before(tp->rx_opt.rcv_tsecr, when);
}

/* skb is spurious retransmitted if the returned timestamp echo
 * reply is prior to the skb transmission time
 */
static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
				     const struct sk_buff *skb)
{
	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
}

/* Nothing was retransmitted or returned timestamp is less
 * than timestamp of the first retransmission.
 */
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
	return !tp->retrans_stamp ||
	       tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
}

/* Undo procedures. We discard the loss recovery state if we learn that
 * the recovery was spurious, i.e.:
 * 1) D-SACKs show that both the original and the retransmission arrived,
 *    so the retransmits were redundant, or
 * 2) the timestamp echo shows that the original transmission (not the
 *    retransmission) was the one ACKed (tcp_packet_delayed()).
 * In both cases the cwnd reduction can be reverted to the state prior
 * to recovery (prior_ssthresh / undo_cwnd), because no real loss was
 * proven.
 */
static bool tcp_any_retrans_done(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (tp->retrans_out)
		return true;

	skb = tcp_write_queue_head(sk);
	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
		return true;

	return false;
}

#if FASTRETRANS_DEBUG > 1
static void DBGUNDO(struct sock *sk, const char *msg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);

	if (sk->sk_family == AF_INET) {
		pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
			 msg,
			 &inet->inet_daddr, ntohs(inet->inet_dport),
			 tp->snd_cwnd, tcp_left_out(tp),
			 tp->snd_ssthresh, tp->prior_ssthresh,
			 tp->packets_out);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (sk->sk_family == AF_INET6) {
		struct ipv6_pinfo *np = inet6_sk(sk);
		pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
			 msg,
			 &np->daddr, ntohs(inet->inet_dport),
			 tp->snd_cwnd, tcp_left_out(tp),
			 tp->snd_ssthresh, tp->prior_ssthresh,
			 tp->packets_out);
	}
#endif
}
#else
#define DBGUNDO(x...) do { } while (0)
#endif

static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unmark_loss) {
		struct sk_buff *skb;

		tcp_for_write_queue(skb, sk) {
			if (skb == tcp_send_head(sk))
				break;
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
		}
		tp->lost_out = 0;
		tcp_clear_all_retrans_hints(tp);
	}

	if (tp->prior_ssthresh) {
		const struct inet_connection_sock *icsk = inet_csk(sk);

		if (icsk->icsk_ca_ops->undo_cwnd)
			tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
		else
			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);

		if (tp->prior_ssthresh > tp->snd_ssthresh) {
			tp->snd_ssthresh = tp->prior_ssthresh;
			tcp_ecn_withdraw_cwr(tp);
		}
	}
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->undo_marker = 0;
}

static inline bool tcp_may_undo(const struct tcp_sock *tp)
{
	return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}

/* People celebrate: "We love our President!" */
static bool tcp_try_undo_recovery(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tcp_may_undo(tp)) {
		int mib_idx;

		/* Happy end! We did not retransmit anything
		 * or our original transmission succeeded.
		 */
		DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
		tcp_undo_cwnd_reduction(sk, false);
		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
			mib_idx = LINUX_MIB_TCPLOSSUNDO;
		else
			mib_idx = LINUX_MIB_TCPFULLUNDO;

		NET_INC_STATS(sock_net(sk), mib_idx);
	}
	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
		/* Hold old state until something *above* high_seq
		 * is ACKed. For Reno it is MUST to prevent false
		 * fast retransmits (RFC2582). SACK TCP is safe. */
		if (!tcp_any_retrans_done(sk))
			tp->retrans_stamp = 0;
		return true;
	}
	tcp_set_ca_state(sk, TCP_CA_Open);
	return false;
}

/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static bool tcp_try_undo_dsack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->undo_marker && !tp->undo_retrans) {
		DBGUNDO(sk, "D-SACK");
		tcp_undo_cwnd_reduction(sk, false);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
		return true;
	}
	return false;
}

/* Undo during loss recovery after partial ACK or using F-RTO. */
static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (frto_undo || tcp_may_undo(tp)) {
		tcp_undo_cwnd_reduction(sk, true);

		DBGUNDO(sk, "partial loss");
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
		if (frto_undo)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPSPURIOUSRTOS);
		inet_csk(sk)->icsk_retransmits = 0;
		if (frto_undo || tcp_is_sack(tp))
			tcp_set_ca_state(sk, TCP_CA_Open);
		return true;
	}
	return false;
}

/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
 * It computes the number of packets to send (sndcnt) based on packets newly
 * delivered:
 *   1) If the packets in flight is larger than ssthresh, PRR spreads the
 *	cwnd reductions across a full RTT.
 *   2) Otherwise PRR uses packet conservation to send as much as delivered.
 *      But when the retransmits are acked without further losses, PRR
 *      slow starts cwnd up to ssthresh to speed up the recovery.
 */
static void tcp_init_cwnd_reduction(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->high_seq = tp->snd_nxt;
	tp->tlp_high_seq = 0;
	tp->snd_cwnd_cnt = 0;
	tp->prior_cwnd = tp->snd_cwnd;
	tp->prr_delivered = 0;
	tp->prr_out = 0;
	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
	tcp_ecn_queue_cwr(tp);
}

static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
			       int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int sndcnt = 0;
	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);

	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
		return;

	tp->prr_delivered += newly_acked_sacked;
	if (delta < 0) {
		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
			       tp->prior_cwnd - 1;
		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
	} else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
		   !(flag & FLAG_LOST_RETRANS)) {
		sndcnt = min_t(int, delta,
			       max_t(int, tp->prr_delivered - tp->prr_out,
				     newly_acked_sacked) + 1);
	} else {
		sndcnt = min(delta, newly_acked_sacked);
	}
	/* Force a fast retransmit upon entering fast recovery */
	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
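
/* PRR worked example (illustrative, not part of the original code):
 * prior_cwnd = 10, ssthresh = 7, with 4 packets delivered so far in
 * recovery and prr_out = 2.  While in_flight > ssthresh,
 * sndcnt = ceil(7 * 4 / 10) - 2 = 1, so cwnd shrinks gradually toward
 * ssthresh over one RTT instead of collapsing at once.
 */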

static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
	if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
	    (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
		tp->snd_cwnd = tp->snd_ssthresh;
		tp->snd_cwnd_stamp = tcp_time_stamp;
	}
	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}

/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
void tcp_enter_cwr(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->prior_ssthresh = 0;
	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
		tp->undo_marker = 0;
		tcp_init_cwnd_reduction(sk);
		tcp_set_ca_state(sk, TCP_CA_CWR);
	}
}
EXPORT_SYMBOL(tcp_enter_cwr);

static void tcp_try_keep_open(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int state = TCP_CA_Open;

	if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
		state = TCP_CA_Disorder;

	if (inet_csk(sk)->icsk_ca_state != state) {
		tcp_set_ca_state(sk, state);
		tp->high_seq = tp->snd_nxt;
	}
}

static void tcp_try_to_open(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_verify_left_out(tp);

	if (!tcp_any_retrans_done(sk))
		tp->retrans_stamp = 0;

	if (flag & FLAG_ECE)
		tcp_enter_cwr(sk);

	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
		tcp_try_keep_open(sk);
	}
}

static void tcp_mtup_probe_failed(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
	icsk->icsk_mtup.probe_size = 0;
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}

static void tcp_mtup_probe_success(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* FIXME: breaks with very large cwnd */
	tp->prior_ssthresh = tcp_current_ssthresh(sk);
	tp->snd_cwnd = tp->snd_cwnd *
		       tcp_mss_to_mtu(sk, tp->mss_cache) /
		       icsk->icsk_mtup.probe_size;
	tp->snd_cwnd_cnt = 0;
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_ssthresh = tcp_current_ssthresh(sk);

	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
	icsk->icsk_mtup.probe_size = 0;
	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}

/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk);
	u32 prior_lost = tp->lost_out;

	tcp_for_write_queue(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		if (tcp_skb_seglen(skb) > mss &&
		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out -= tcp_skb_pcount(skb);
			}
			tcp_skb_mark_lost_uncond_verify(tp, skb);
		}
	}

	tcp_clear_retrans_hints_partial(tp);

	if (prior_lost == tp->lost_out)
		return;

	if (tcp_is_reno(tp))
		tcp_limit_reno_sacked(tp);

	tcp_verify_left_out(tp);

	/* Don't muck with the congestion window here.
	 * Reason is that we do not increase amount of _data_
	 * in network, but units changed and effective
	 * cwnd/ssthresh really reduced now.
	 */
	if (icsk->icsk_ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(sk);
		tp->prior_ssthresh = 0;
		tp->undo_marker = 0;
		tcp_set_ca_state(sk, TCP_CA_Loss);
	}
	tcp_xmit_retransmit_queue(sk);
}
EXPORT_SYMBOL(tcp_simple_retransmit);
2639
2640static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2641{
2642 struct tcp_sock *tp = tcp_sk(sk);
2643 int mib_idx;
2644
2645 if (tcp_is_reno(tp))
2646 mib_idx = LINUX_MIB_TCPRENORECOVERY;
2647 else
2648 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2649
2650 NET_INC_STATS(sock_net(sk), mib_idx);
2651
2652 tp->prior_ssthresh = 0;
2653 tcp_init_undo(tp);
2654
2655 if (!tcp_in_cwnd_reduction(sk)) {
2656 if (!ece_ack)
2657 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2658 tcp_init_cwnd_reduction(sk);
2659 }
2660 tcp_set_ca_state(sk, TCP_CA_Recovery);
2661}
2662
2663
2664
2665
2666static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2667 int *rexmit)
2668{
2669 struct tcp_sock *tp = tcp_sk(sk);
2670 bool recovered = !before(tp->snd_una, tp->high_seq);
2671
2672 if ((flag & FLAG_SND_UNA_ADVANCED) &&
2673 tcp_try_undo_loss(sk, false))
2674 return;
2675
2676 if (tp->frto) {
2677
2678
2679
2680 if ((flag & FLAG_ORIG_SACK_ACKED) &&
2681 tcp_try_undo_loss(sk, true))
2682 return;
2683
2684 if (after(tp->snd_nxt, tp->high_seq)) {
2685 if (flag & FLAG_DATA_SACKED || is_dupack)
2686 tp->frto = 0;
2687 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2688 tp->high_seq = tp->snd_nxt;
2689
2690
2691
2692
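			/* Prefer sending new data over retransmitting, but
			 * defer it until cwnd has been updated in tcp_ack();
			 * if that is not possible, fall back to conventional
			 * loss recovery below.
			 */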
2693 if (tcp_send_head(sk) &&
2694 after(tcp_wnd_end(tp), tp->snd_nxt)) {
2695 *rexmit = REXMIT_NEW;
2696 return;
2697 }
2698 tp->frto = 0;
2699 }
2700 }
2701
2702 if (recovered) {
2703
2704 tcp_try_undo_recovery(sk);
2705 return;
2706 }
2707 if (tcp_is_reno(tp)) {
2708
2709
2710
2711 if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2712 tcp_add_reno_sack(sk);
2713 else if (flag & FLAG_SND_UNA_ADVANCED)
2714 tcp_reset_reno_sack(tp);
2715 }
2716 *rexmit = REXMIT_LOST;
2717}
2718
2719
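/* Undo during fast recovery after partial ACK. */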
2720static bool tcp_try_undo_partial(struct sock *sk, const int acked)
2721{
2722 struct tcp_sock *tp = tcp_sk(sk);
2723
2724 if (tp->undo_marker && tcp_packet_delayed(tp)) {
2725
2726
2727
2728 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
2729
2730
2731
2732
2733
2734
2735 if (tp->retrans_out)
2736 return true;
2737
2738 if (!tcp_any_retrans_done(sk))
2739 tp->retrans_stamp = 0;
2740
2741 DBGUNDO(sk, "partial recovery");
2742 tcp_undo_cwnd_reduction(sk, true);
2743 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2744 tcp_try_keep_open(sk);
2745 return true;
2746 }
2747 return false;
2748}
2749
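/* Process an event, which can update packets-in-flight not trivially.
 * Main goal of this function is to calculate new estimate for left_out,
 * taking into account both packets sitting in receiver's buffer and
 * packets which are being retransmitted.
 *
 * Besides that it does CWND reduction, when packet loss was detected
 * and changes state of machine.
 *
 * It does _not_ decide what to send, that is done in
 * tcp_xmit_retransmit_queue().
 */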
2762static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2763 bool is_dupack, int *ack_flag, int *rexmit)
2764{
2765 struct inet_connection_sock *icsk = inet_csk(sk);
2766 struct tcp_sock *tp = tcp_sk(sk);
2767 int fast_rexmit = 0, flag = *ack_flag;
2768 bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2769 (tcp_fackets_out(tp) > tp->reordering));
2770
2771 if (WARN_ON(!tp->packets_out && tp->sacked_out))
2772 tp->sacked_out = 0;
2773 if (WARN_ON(!tp->sacked_out && tp->fackets_out))
2774 tp->fackets_out = 0;
2775
2776
2777
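	/* Now the state machine starts: an ECE mark prohibits cwnd undo,
	 * the reduction is required.
	 */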
2778 if (flag & FLAG_ECE)
2779 tp->prior_ssthresh = 0;
2780
2781
2782 if (tcp_check_sack_reneging(sk, flag))
2783 return;
2784
2785
2786 tcp_verify_left_out(tp);
2787
2788
2789
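	/* Check state exit conditions; state can be terminated
	 * when high_seq is ACKed.
	 */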
2790 if (icsk->icsk_ca_state == TCP_CA_Open) {
2791 WARN_ON(tp->retrans_out != 0);
2792 tp->retrans_stamp = 0;
2793 } else if (!before(tp->snd_una, tp->high_seq)) {
2794 switch (icsk->icsk_ca_state) {
2795 case TCP_CA_CWR:
2796
2797
2798 if (tp->snd_una != tp->high_seq) {
2799 tcp_end_cwnd_reduction(sk);
2800 tcp_set_ca_state(sk, TCP_CA_Open);
2801 }
2802 break;
2803
2804 case TCP_CA_Recovery:
2805 if (tcp_is_reno(tp))
2806 tcp_reset_reno_sack(tp);
2807 if (tcp_try_undo_recovery(sk))
2808 return;
2809 tcp_end_cwnd_reduction(sk);
2810 break;
2811 }
2812 }
2813
2814
2815 if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
2816 tcp_rack_mark_lost(sk)) {
2817 flag |= FLAG_LOST_RETRANS;
2818 *ack_flag |= FLAG_LOST_RETRANS;
2819 }
2820
2821
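	/* Process the current congestion-avoidance state. */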
2822 switch (icsk->icsk_ca_state) {
2823 case TCP_CA_Recovery:
2824 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
2825 if (tcp_is_reno(tp) && is_dupack)
2826 tcp_add_reno_sack(sk);
2827 } else {
2828 if (tcp_try_undo_partial(sk, acked))
2829 return;
2830
2831 do_lost = tcp_is_reno(tp) ||
2832 tcp_fackets_out(tp) > tp->reordering;
2833 }
2834 if (tcp_try_undo_dsack(sk)) {
2835 tcp_try_keep_open(sk);
2836 return;
2837 }
2838 break;
2839 case TCP_CA_Loss:
2840 tcp_process_loss(sk, flag, is_dupack, rexmit);
2841 if (icsk->icsk_ca_state != TCP_CA_Open &&
2842 !(flag & FLAG_LOST_RETRANS))
2843 return;
2844
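		/* Change state if cwnd is undone or retransmits are lost;
		 * deliberate fall through to the default handling.
		 */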
2845 default:
2846 if (tcp_is_reno(tp)) {
2847 if (flag & FLAG_SND_UNA_ADVANCED)
2848 tcp_reset_reno_sack(tp);
2849 if (is_dupack)
2850 tcp_add_reno_sack(sk);
2851 }
2852
2853 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
2854 tcp_try_undo_dsack(sk);
2855
2856 if (!tcp_time_to_recover(sk, flag)) {
2857 tcp_try_to_open(sk, flag);
2858 return;
2859 }
2860
2861
2862 if (icsk->icsk_ca_state < TCP_CA_CWR &&
2863 icsk->icsk_mtup.probe_size &&
2864 tp->snd_una == tp->mtu_probe.probe_seq_start) {
2865 tcp_mtup_probe_failed(sk);
2866
2867 tp->snd_cwnd++;
2868 tcp_simple_retransmit(sk);
2869 return;
2870 }
2871
2872
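		/* Otherwise enter Recovery state */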
2873 tcp_enter_recovery(sk, (flag & FLAG_ECE));
2874 fast_rexmit = 1;
2875 }
2876
2877 if (do_lost)
2878 tcp_update_scoreboard(sk, fast_rexmit);
2879 *rexmit = REXMIT_LOST;
2880}
2881
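/* Track the minimum RTT over a sliding time window in constant space:
 * keep the best, 2nd best and 3rd best measurements, maintaining the
 * invariant that the n'th best is no older than the (n-1)'th. A new
 * overall minimum supersedes everything earlier; when the best value
 * ages out of the window, the later choices are promoted.
 */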
2900static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2901{
2902 const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
2903 struct rtt_meas *m = tcp_sk(sk)->rtt_min;
2904 struct rtt_meas rttm = {
2905 .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
2906 .ts = now,
2907 };
2908 u32 elapsed;
2909
2910
2911 if (unlikely(rttm.rtt <= m[0].rtt))
2912 m[0] = m[1] = m[2] = rttm;
2913 else if (rttm.rtt <= m[1].rtt)
2914 m[1] = m[2] = rttm;
2915 else if (rttm.rtt <= m[2].rtt)
2916 m[2] = rttm;
2917
2918 elapsed = now - m[0].ts;
2919 if (unlikely(elapsed > wlen)) {
2920
2921
2922
2923 m[0] = m[1];
2924 m[1] = m[2];
2925 m[2] = rttm;
2926 if (now - m[0].ts > wlen) {
2927 m[0] = m[1];
2928 m[1] = rttm;
2929 if (now - m[0].ts > wlen)
2930 m[0] = rttm;
2931 }
2932 } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
2933
2934
2935
2936 m[2] = m[1] = rttm;
2937 } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
2938
2939
2940
2941 m[2] = rttm;
2942 }
2943}
2944
2945static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2946 long seq_rtt_us, long sack_rtt_us,
2947 long ca_rtt_us)
2948{
2949 const struct tcp_sock *tp = tcp_sk(sk);
2950
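	/* Prefer RTT measured from ACK's timing to TS-ECR. This is because
	 * broken middle-boxes or peers may corrupt TS-ECR fields. But
	 * Karn's algorithm forbids taking RTT if some retransmitted data
	 * is acked (RFC 6298).
	 */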
2956 if (seq_rtt_us < 0)
2957 seq_rtt_us = sack_rtt_us;
2958
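	/* RTTM Rule: A TSecr value received in a segment is used to
	 * update the averaged RTT measurement only if the segment
	 * acknowledges some new data, i.e., only if it advances the
	 * left edge of the send window.
	 */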
2965 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2966 flag & FLAG_ACKED)
2967 seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
2968 tp->rx_opt.rcv_tsecr);
2969 if (seq_rtt_us < 0)
2970 return false;
2971
2972
2973
2974
2975
2976 tcp_update_rtt_min(sk, ca_rtt_us);
2977 tcp_rtt_estimator(sk, seq_rtt_us);
2978 tcp_set_rto(sk);
2979
2980
2981 inet_csk(sk)->icsk_backoff = 0;
2982 return true;
2983}
2984
2985
2986void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
2987{
2988 long rtt_us = -1L;
2989
2990 if (req && !req->num_retrans && tcp_rsk(req)->snt_synack.v64) {
2991 struct skb_mstamp now;
2992
2993 skb_mstamp_get(&now);
2994 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
2995 }
2996
2997 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
2998}
2999
3000
3001static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
3002{
3003 const struct inet_connection_sock *icsk = inet_csk(sk);
3004
3005 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
3006 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
3007}
3008
3009
3010
3011
3012void tcp_rearm_rto(struct sock *sk)
3013{
3014 const struct inet_connection_sock *icsk = inet_csk(sk);
3015 struct tcp_sock *tp = tcp_sk(sk);
3016
3017
3018
3019
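	/* If the retrans timer is currently being used by Fast Open
	 * for SYN-ACK retrans purposes, stay put.
	 */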
3020 if (tp->fastopen_rsk)
3021 return;
3022
3023 if (!tp->packets_out) {
3024 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3025 } else {
3026 u32 rto = inet_csk(sk)->icsk_rto;
3027
3028 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3029 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3030 struct sk_buff *skb = tcp_write_queue_head(sk);
3031 const u32 rto_time_stamp =
3032 tcp_skb_timestamp(skb) + rto;
3033 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
3034
3035
3036
3037 if (delta > 0)
3038 rto = delta;
3039 }
3040 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3041 TCP_RTO_MAX);
3042 }
3043}
3044
3045
3046
3047
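/* Called when the delayed early-retransmit timer fires: enter fast
 * recovery and perform the pending fast retransmits.
 */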
3048void tcp_resume_early_retransmit(struct sock *sk)
3049{
3050 struct tcp_sock *tp = tcp_sk(sk);
3051
3052 tcp_rearm_rto(sk);
3053
3054
3055 if (!tp->do_early_retrans)
3056 return;
3057
3058 tcp_enter_recovery(sk, false);
3059 tcp_update_scoreboard(sk, 1);
3060 tcp_xmit_retransmit_queue(sk);
3061}
3062
3063
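/* If we get here, the whole TSO packet has not been acked. */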
3064static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3065{
3066 struct tcp_sock *tp = tcp_sk(sk);
3067 u32 packets_acked;
3068
3069 BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
3070
3071 packets_acked = tcp_skb_pcount(skb);
3072 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3073 return 0;
3074 packets_acked -= tcp_skb_pcount(skb);
3075
3076 if (packets_acked) {
3077 BUG_ON(tcp_skb_pcount(skb) == 0);
3078 BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
3079 }
3080
3081 return packets_acked;
3082}
3083
3084static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
3085 u32 prior_snd_una)
3086{
3087 const struct skb_shared_info *shinfo;
3088
3089
3090 if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
3091 return;
3092
3093 shinfo = skb_shinfo(skb);
3094 if (!before(shinfo->tskey, prior_snd_una) &&
3095 before(shinfo->tskey, tcp_sk(sk)->snd_una))
3096 __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
3097}
3098
3099
3100
3101
3102
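/* Remove acknowledged frames from the retransmission queue. If our packet
 * is before the ack sequence we can discard it as it's confirmed to have
 * arrived at the other end.
 */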
3103static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3104 u32 prior_snd_una, int *acked,
3105 struct tcp_sacktag_state *sack)
3106{
3107 const struct inet_connection_sock *icsk = inet_csk(sk);
3108 struct skb_mstamp first_ackt, last_ackt, now;
3109 struct tcp_sock *tp = tcp_sk(sk);
3110 u32 prior_sacked = tp->sacked_out;
3111 u32 reord = tp->packets_out;
3112 bool fully_acked = true;
3113 long sack_rtt_us = -1L;
3114 long seq_rtt_us = -1L;
3115 long ca_rtt_us = -1L;
3116 struct sk_buff *skb;
3117 u32 pkts_acked = 0;
3118 bool rtt_update;
3119 int flag = 0;
3120
3121 first_ackt.v64 = 0;
3122
3123 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
3124 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3125 u8 sacked = scb->sacked;
3126 u32 acked_pcount;
3127
3128 tcp_ack_tstamp(sk, skb, prior_snd_una);
3129
3130
3131 if (after(scb->end_seq, tp->snd_una)) {
3132 if (tcp_skb_pcount(skb) == 1 ||
3133 !after(tp->snd_una, scb->seq))
3134 break;
3135
3136 acked_pcount = tcp_tso_acked(sk, skb);
3137 if (!acked_pcount)
3138 break;
3139
3140 fully_acked = false;
3141 } else {
3142
3143 prefetchw(skb->next);
3144 acked_pcount = tcp_skb_pcount(skb);
3145 }
3146
3147 if (unlikely(sacked & TCPCB_RETRANS)) {
3148 if (sacked & TCPCB_SACKED_RETRANS)
3149 tp->retrans_out -= acked_pcount;
3150 flag |= FLAG_RETRANS_DATA_ACKED;
3151 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3152 last_ackt = skb->skb_mstamp;
3153 WARN_ON_ONCE(last_ackt.v64 == 0);
3154 if (!first_ackt.v64)
3155 first_ackt = last_ackt;
3156
3157 reord = min(pkts_acked, reord);
3158 if (!after(scb->end_seq, tp->high_seq))
3159 flag |= FLAG_ORIG_SACK_ACKED;
3160 }
3161
3162 if (sacked & TCPCB_SACKED_ACKED) {
3163 tp->sacked_out -= acked_pcount;
3164 } else if (tcp_is_sack(tp)) {
3165 tp->delivered += acked_pcount;
3166 if (!tcp_skb_spurious_retrans(tp, skb))
3167 tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
3168 }
3169 if (sacked & TCPCB_LOST)
3170 tp->lost_out -= acked_pcount;
3171
3172 tp->packets_out -= acked_pcount;
3173 pkts_acked += acked_pcount;
3174
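		/* Initial outgoing SYN's get put onto the write_queue
		 * just like anything else we transmit. It is not
		 * true data, and if we misinform our callers that
		 * this ACK acks real data, we will erroneously exit
		 * connection startup slow start one packet too
		 * quickly.
		 */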
3182 if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
3183 flag |= FLAG_DATA_ACKED;
3184 } else {
3185 flag |= FLAG_SYN_ACKED;
3186 tp->retrans_stamp = 0;
3187 }
3188
3189 if (!fully_acked)
3190 break;
3191
3192 tcp_unlink_write_queue(skb, sk);
3193 sk_wmem_free_skb(sk, skb);
3194 if (unlikely(skb == tp->retransmit_skb_hint))
3195 tp->retransmit_skb_hint = NULL;
3196 if (unlikely(skb == tp->lost_skb_hint))
3197 tp->lost_skb_hint = NULL;
3198 }
3199
3200 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
3201 tp->snd_up = tp->snd_una;
3202
3203 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3204 flag |= FLAG_SACK_RENEGING;
3205
3206 skb_mstamp_get(&now);
3207 if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
3208 seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
3209 ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
3210 }
3211 if (sack->first_sackt.v64) {
3212 sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
3213 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
3214 }
3215
3216 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3217 ca_rtt_us);
3218
3219 if (flag & FLAG_ACKED) {
3220 tcp_rearm_rto(sk);
3221 if (unlikely(icsk->icsk_mtup.probe_size &&
3222 !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3223 tcp_mtup_probe_success(sk);
3224 }
3225
3226 if (tcp_is_reno(tp)) {
3227 tcp_remove_reno_sacks(sk, pkts_acked);
3228 } else {
3229 int delta;
3230
3231
3232 if (reord < prior_fackets)
3233 tcp_update_reordering(sk, tp->fackets_out - reord, 0);
3234
3235 delta = tcp_is_fack(tp) ? pkts_acked :
3236 prior_sacked - tp->sacked_out;
3237 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3238 }
3239
3240 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3241
3242 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3243 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
3244
3245
3246
3247
3248 tcp_rearm_rto(sk);
3249 }
3250
3251 if (icsk->icsk_ca_ops->pkts_acked) {
3252 struct ack_sample sample = { .pkts_acked = pkts_acked,
3253 .rtt_us = ca_rtt_us };
3254
3255 icsk->icsk_ca_ops->pkts_acked(sk, &sample);
3256 }
3257
3258#if FASTRETRANS_DEBUG > 0
3259 WARN_ON((int)tp->sacked_out < 0);
3260 WARN_ON((int)tp->lost_out < 0);
3261 WARN_ON((int)tp->retrans_out < 0);
3262 if (!tp->packets_out && tcp_is_sack(tp)) {
3263 icsk = inet_csk(sk);
3264 if (tp->lost_out) {
3265 pr_debug("Leak l=%u %d\n",
3266 tp->lost_out, icsk->icsk_ca_state);
3267 tp->lost_out = 0;
3268 }
3269 if (tp->sacked_out) {
3270 pr_debug("Leak s=%u %d\n",
3271 tp->sacked_out, icsk->icsk_ca_state);
3272 tp->sacked_out = 0;
3273 }
3274 if (tp->retrans_out) {
3275 pr_debug("Leak r=%u %d\n",
3276 tp->retrans_out, icsk->icsk_ca_state);
3277 tp->retrans_out = 0;
3278 }
3279 }
3280#endif
3281 *acked = pkts_acked;
3282 return flag;
3283}
3284
3285static void tcp_ack_probe(struct sock *sk)
3286{
3287 const struct tcp_sock *tp = tcp_sk(sk);
3288 struct inet_connection_sock *icsk = inet_csk(sk);
3289
3290
3291
3292 if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
3293 icsk->icsk_backoff = 0;
3294 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3295
3296
3297
3298 } else {
3299 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
3300
3301 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3302 when, TCP_RTO_MAX);
3303 }
3304}
3305
3306static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3307{
3308 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3309 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3310}
3311
3312
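/* Decide whether to run the increase function of congestion control. */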
3313static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3314{
3315
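	/* If reordering is high then always grow cwnd whenever data is
	 * delivered, regardless of its flags; otherwise grow it only
	 * when new data was actually acked.
	 */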
3321 if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
3322 return flag & FLAG_FORWARD_PROGRESS;
3323
3324 return flag & FLAG_DATA_ACKED;
3325}
3326
3327
3328
3329
3330
3331
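/* The "ultimate" congestion control function that aims to replace the rigid
 * cwnd increase and decrease control (tcp_cong_avoid,tcp_cwnd_reduction).
 * It's called toward the end of processing an ACK with precise rate
 * information. All transmission or retransmission are delayed afterwards.
 */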
3332static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
3333 int flag)
3334{
3335 if (tcp_in_cwnd_reduction(sk)) {
3336
3337 tcp_cwnd_reduction(sk, acked_sacked, flag);
3338 } else if (tcp_may_raise_cwnd(sk, flag)) {
3339
3340 tcp_cong_avoid(sk, ack, acked_sacked);
3341 }
3342 tcp_update_pacing_rate(sk);
3343}
3344
3345
3346
3347
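/* Check that window update is acceptable.
 * The function assumes that snd_una<=ack<=snd_next.
 */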
3348static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3349 const u32 ack, const u32 ack_seq,
3350 const u32 nwin)
3351{
3352 return after(ack, tp->snd_una) ||
3353 after(ack_seq, tp->snd_wl1) ||
3354 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3355}
3356
3357
3358static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
3359{
3360 u32 delta = ack - tp->snd_una;
3361
3362 sock_owned_by_me((struct sock *)tp);
3363 u64_stats_update_begin_raw(&tp->syncp);
3364 tp->bytes_acked += delta;
3365 u64_stats_update_end_raw(&tp->syncp);
3366 tp->snd_una = ack;
3367}
3368
3369
3370static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
3371{
3372 u32 delta = seq - tp->rcv_nxt;
3373
3374 sock_owned_by_me((struct sock *)tp);
3375 u64_stats_update_begin_raw(&tp->syncp);
3376 tp->bytes_received += delta;
3377 u64_stats_update_end_raw(&tp->syncp);
3378 tp->rcv_nxt = seq;
3379}
3380
3381
3382
3383
3384
3385
3386static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
3387 u32 ack_seq)
3388{
3389 struct tcp_sock *tp = tcp_sk(sk);
3390 int flag = 0;
3391 u32 nwin = ntohs(tcp_hdr(skb)->window);
3392
3393 if (likely(!tcp_hdr(skb)->syn))
3394 nwin <<= tp->rx_opt.snd_wscale;
3395
3396 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
3397 flag |= FLAG_WIN_UPDATE;
3398 tcp_update_wl(tp, ack_seq);
3399
3400 if (tp->snd_wnd != nwin) {
3401 tp->snd_wnd = nwin;
3402
3403
3404
3405
3406 tp->pred_flags = 0;
3407 tcp_fast_path_check(sk);
3408
3409 if (tcp_send_head(sk))
3410 tcp_slow_start_after_idle_check(sk);
3411
3412 if (nwin > tp->max_window) {
3413 tp->max_window = nwin;
3414 tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
3415 }
3416 }
3417 }
3418
3419 tcp_snd_una_update(tp, ack);
3420
3421 return flag;
3422}
3423
3424static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
3425 u32 *last_oow_ack_time)
3426{
3427 if (*last_oow_ack_time) {
3428 s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
3429
3430 if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
3431 NET_INC_STATS(net, mib_idx);
3432 return true;
3433 }
3434 }
3435
3436 *last_oow_ack_time = tcp_time_stamp;
3437
3438 return false;
3439}
3440
3441
3442
3443
3444
3445
3446
3447
3448bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
3449 int mib_idx, u32 *last_oow_ack_time)
3450{
3451
3452 if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
3453 !tcp_hdr(skb)->syn)
3454 return false;
3455
3456 return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
3457}
3458
3459
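/* RFC 5961 7 [ACK Throttling] */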
3460static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
3461{
3462
3463 static u32 challenge_timestamp;
3464 static unsigned int challenge_count;
3465 struct tcp_sock *tp = tcp_sk(sk);
3466 u32 count, now;
3467
3468
3469 if (__tcp_oow_rate_limited(sock_net(sk),
3470 LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
3471 &tp->last_oow_ack_time))
3472 return;
3473
3474
3475 now = jiffies / HZ;
3476 if (now != challenge_timestamp) {
3477 u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
3478
3479 challenge_timestamp = now;
3480 WRITE_ONCE(challenge_count, half +
3481 prandom_u32_max(sysctl_tcp_challenge_ack_limit));
3482 }
3483 count = READ_ONCE(challenge_count);
3484 if (count > 0) {
3485 WRITE_ONCE(challenge_count, count - 1);
3486 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
3487 tcp_send_ack(sk);
3488 }
3489}
3490
3491static void tcp_store_ts_recent(struct tcp_sock *tp)
3492{
3493 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3494 tp->rx_opt.ts_recent_stamp = get_seconds();
3495}
3496
3497static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3498{
3499 if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3500
3501
3502
3503
3504
3505
3506
3507 if (tcp_paws_check(&tp->rx_opt, 0))
3508 tcp_store_ts_recent(tp);
3509 }
3510}
3511
3512
3513
3514
3515
3516
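/* This routine deals with acks during a TLP episode.
 * The episode ends on a DSACK for the probe, on an ACK above
 * tlp_high_seq, or on a pure dupack for the probe itself.
 */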
3517static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3518{
3519 struct tcp_sock *tp = tcp_sk(sk);
3520
3521 if (before(ack, tp->tlp_high_seq))
3522 return;
3523
3524 if (flag & FLAG_DSACKING_ACK) {
3525
3526 tp->tlp_high_seq = 0;
3527 } else if (after(ack, tp->tlp_high_seq)) {
3528
3529
3530
3531 tcp_init_cwnd_reduction(sk);
3532 tcp_set_ca_state(sk, TCP_CA_CWR);
3533 tcp_end_cwnd_reduction(sk);
3534 tcp_try_keep_open(sk);
3535 NET_INC_STATS(sock_net(sk),
3536 LINUX_MIB_TCPLOSSPROBERECOVERY);
3537 } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
3538 FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
3539
3540 tp->tlp_high_seq = 0;
3541 }
3542}
3543
3544static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
3545{
3546 const struct inet_connection_sock *icsk = inet_csk(sk);
3547
3548 if (icsk->icsk_ca_ops->in_ack_event)
3549 icsk->icsk_ca_ops->in_ack_event(sk, flags);
3550}
3551
3552
3553
3554
3555
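/* Congestion control has updated the cwnd already. So if we're in
 * loss recovery then now we do any new sends (for FRTO) or
 * retransmits (for CA_Loss or CA_Recovery) that make sense.
 */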
3556static void tcp_xmit_recovery(struct sock *sk, int rexmit)
3557{
3558 struct tcp_sock *tp = tcp_sk(sk);
3559
3560 if (rexmit == REXMIT_NONE)
3561 return;
3562
	if (unlikely(rexmit == REXMIT_NEW)) {
3564 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
3565 TCP_NAGLE_OFF);
3566 if (after(tp->snd_nxt, tp->high_seq))
3567 return;
3568 tp->frto = 0;
3569 }
3570 tcp_xmit_retransmit_queue(sk);
3571}
3572
3573
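/* This routine deals with incoming acks, but not outgoing ones. */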
3574static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3575{
3576 struct inet_connection_sock *icsk = inet_csk(sk);
3577 struct tcp_sock *tp = tcp_sk(sk);
3578 struct tcp_sacktag_state sack_state;
3579 u32 prior_snd_una = tp->snd_una;
3580 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3581 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3582 bool is_dupack = false;
3583 u32 prior_fackets;
3584 int prior_packets = tp->packets_out;
3585 u32 prior_delivered = tp->delivered;
3586 int acked = 0;
3587 int rexmit = REXMIT_NONE;
3588
3589 sack_state.first_sackt.v64 = 0;
3590
3591
3592 prefetchw(sk->sk_write_queue.next);
3593
3594
3595
3596
3597 if (before(ack, prior_snd_una)) {
3598
3599 if (before(ack, prior_snd_una - tp->max_window)) {
3600 tcp_send_challenge_ack(sk, skb);
3601 return -1;
3602 }
3603 goto old_ack;
3604 }
3605
3606
3607
3608
3609 if (after(ack, tp->snd_nxt))
3610 goto invalid_ack;
3611
3612 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3613 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3614 tcp_rearm_rto(sk);
3615
3616 if (after(ack, prior_snd_una)) {
3617 flag |= FLAG_SND_UNA_ADVANCED;
3618 icsk->icsk_retransmits = 0;
3619 }
3620
3621 prior_fackets = tp->fackets_out;
3622
3623
3624
3625
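	/* ts_recent update must be made after we are sure that the packet
	 * is in window.
	 */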
3626 if (flag & FLAG_UPDATE_TS_RECENT)
3627 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3628
3629 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
3630
3631
3632
3633
3634 tcp_update_wl(tp, ack_seq);
3635 tcp_snd_una_update(tp, ack);
3636 flag |= FLAG_WIN_UPDATE;
3637
3638 tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
3639
3640 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
3641 } else {
3642 u32 ack_ev_flags = CA_ACK_SLOWPATH;
3643
3644 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
3645 flag |= FLAG_DATA;
3646 else
3647 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
3648
3649 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
3650
3651 if (TCP_SKB_CB(skb)->sacked)
3652 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3653 &sack_state);
3654
3655 if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
3656 flag |= FLAG_ECE;
3657 ack_ev_flags |= CA_ACK_ECE;
3658 }
3659
3660 if (flag & FLAG_WIN_UPDATE)
3661 ack_ev_flags |= CA_ACK_WIN_UPDATE;
3662
3663 tcp_in_ack_event(sk, ack_ev_flags);
3664 }
3665
3666
3667
3668
3669 sk->sk_err_soft = 0;
3670 icsk->icsk_probes_out = 0;
3671 tp->rcv_tstamp = tcp_time_stamp;
3672 if (!prior_packets)
3673 goto no_queue;
3674
3675
3676 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
3677 &sack_state);
3678
3679 if (tcp_ack_is_dubious(sk, flag)) {
3680 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3681 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
3682 }
3683 if (tp->tlp_high_seq)
3684 tcp_process_tlp_ack(sk, ack, flag);
3685
3686 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3687 struct dst_entry *dst = __sk_dst_get(sk);
3688 if (dst)
3689 dst_confirm(dst);
3690 }
3691
3692 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3693 tcp_schedule_loss_probe(sk);
3694 tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
3695 tcp_xmit_recovery(sk, rexmit);
3696 return 1;
3697
3698no_queue:
3699
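	/* If data was DSACKed, see if we can undo a cwnd reduction. */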
3700 if (flag & FLAG_DSACKING_ACK)
3701 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
3702
3703
3704
3705
3706 if (tcp_send_head(sk))
3707 tcp_ack_probe(sk);
3708
3709 if (tp->tlp_high_seq)
3710 tcp_process_tlp_ack(sk, ack, flag);
3711 return 1;
3712
3713invalid_ack:
3714 SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3715 return -1;
3716
3717old_ack:
3718
3719
3720
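	/* If data was SACKed, tag it and see if we should send more data.
	 * If data was DSACKed, see if we can undo a cwnd reduction.
	 */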
3721 if (TCP_SKB_CB(skb)->sacked) {
3722 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3723 &sack_state);
3724 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
3725 tcp_xmit_recovery(sk, rexmit);
3726 }
3727
3728 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3729 return 0;
3730}
3731
3732static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
3733 bool syn, struct tcp_fastopen_cookie *foc,
3734 bool exp_opt)
3735{
3736
3737 if (!foc || !syn || len < 0 || (len & 1))
3738 return;
3739
3740 if (len >= TCP_FASTOPEN_COOKIE_MIN &&
3741 len <= TCP_FASTOPEN_COOKIE_MAX)
3742 memcpy(foc->val, cookie, len);
3743 else if (len != 0)
3744 len = -1;
3745 foc->len = len;
3746 foc->exp = exp_opt;
3747}
3748
3749
3750
3751
3752
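/* Look for tcp options. Normally only called on SYN and SYNACK packets.
 * But, this can also be called on packets in the established flow when
 * the fast version below fails.
 */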
3753void tcp_parse_options(const struct sk_buff *skb,
3754 struct tcp_options_received *opt_rx, int estab,
3755 struct tcp_fastopen_cookie *foc)
3756{
3757 const unsigned char *ptr;
3758 const struct tcphdr *th = tcp_hdr(skb);
3759 int length = (th->doff * 4) - sizeof(struct tcphdr);
3760
3761 ptr = (const unsigned char *)(th + 1);
3762 opt_rx->saw_tstamp = 0;
3763
3764 while (length > 0) {
3765 int opcode = *ptr++;
3766 int opsize;
3767
3768 switch (opcode) {
3769 case TCPOPT_EOL:
3770 return;
3771 case TCPOPT_NOP:
3772 length--;
3773 continue;
3774 default:
3775 opsize = *ptr++;
3776 if (opsize < 2)
3777 return;
3778 if (opsize > length)
3779 return;
3780 switch (opcode) {
3781 case TCPOPT_MSS:
3782 if (opsize == TCPOLEN_MSS && th->syn && !estab) {
3783 u16 in_mss = get_unaligned_be16(ptr);
3784 if (in_mss) {
3785 if (opt_rx->user_mss &&
3786 opt_rx->user_mss < in_mss)
3787 in_mss = opt_rx->user_mss;
3788 opt_rx->mss_clamp = in_mss;
3789 }
3790 }
3791 break;
3792 case TCPOPT_WINDOW:
3793 if (opsize == TCPOLEN_WINDOW && th->syn &&
3794 !estab && sysctl_tcp_window_scaling) {
3795 __u8 snd_wscale = *(__u8 *)ptr;
3796 opt_rx->wscale_ok = 1;
3797 if (snd_wscale > 14) {
3798 net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n",
3799 __func__,
3800 snd_wscale);
3801 snd_wscale = 14;
3802 }
3803 opt_rx->snd_wscale = snd_wscale;
3804 }
3805 break;
3806 case TCPOPT_TIMESTAMP:
3807 if ((opsize == TCPOLEN_TIMESTAMP) &&
3808 ((estab && opt_rx->tstamp_ok) ||
3809 (!estab && sysctl_tcp_timestamps))) {
3810 opt_rx->saw_tstamp = 1;
3811 opt_rx->rcv_tsval = get_unaligned_be32(ptr);
3812 opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
3813 }
3814 break;
3815 case TCPOPT_SACK_PERM:
3816 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3817 !estab && sysctl_tcp_sack) {
3818 opt_rx->sack_ok = TCP_SACK_SEEN;
3819 tcp_sack_reset(opt_rx);
3820 }
3821 break;
3822
3823 case TCPOPT_SACK:
3824 if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
3825 !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
3826 opt_rx->sack_ok) {
3827 TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
3828 }
3829 break;
3830#ifdef CONFIG_TCP_MD5SIG
3831 case TCPOPT_MD5SIG:
3832
3833
3834
3835
3836 break;
3837#endif
3838 case TCPOPT_FASTOPEN:
3839 tcp_parse_fastopen_option(
3840 opsize - TCPOLEN_FASTOPEN_BASE,
3841 ptr, th->syn, foc, false);
3842 break;
3843
3844 case TCPOPT_EXP:
3845
3846
3847
3848 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
3849 get_unaligned_be16(ptr) ==
3850 TCPOPT_FASTOPEN_MAGIC)
3851 tcp_parse_fastopen_option(opsize -
3852 TCPOLEN_EXP_FASTOPEN_BASE,
3853 ptr + 2, th->syn, foc, true);
3854 break;
3855
3856 }
3857 ptr += opsize-2;
3858 length -= opsize;
3859 }
3860 }
3861}
3862EXPORT_SYMBOL(tcp_parse_options);
3863
3864static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
3865{
3866 const __be32 *ptr = (const __be32 *)(th + 1);
3867
3868 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3869 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3870 tp->rx_opt.saw_tstamp = 1;
3871 ++ptr;
3872 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3873 ++ptr;
3874 if (*ptr)
3875 tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
3876 else
3877 tp->rx_opt.rcv_tsecr = 0;
3878 return true;
3879 }
3880 return false;
3881}
3882
3883
3884
3885
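/* Fast parse options. This hopes to only see timestamps.
 * If it is wrong it falls back on tcp_parse_options().
 */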
3886static bool tcp_fast_parse_options(const struct sk_buff *skb,
3887 const struct tcphdr *th, struct tcp_sock *tp)
3888{
3889
3890
3891
3892 if (th->doff == (sizeof(*th) / 4)) {
3893 tp->rx_opt.saw_tstamp = 0;
3894 return false;
3895 } else if (tp->rx_opt.tstamp_ok &&
3896 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
3897 if (tcp_parse_aligned_timestamp(tp, th))
3898 return true;
3899 }
3900
3901 tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3902 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3903 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3904
3905 return true;
3906}
3907
3908#ifdef CONFIG_TCP_MD5SIG
3909
3910
3911
3912const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
3913{
3914 int length = (th->doff << 2) - sizeof(*th);
3915 const u8 *ptr = (const u8 *)(th + 1);
3916
3917
3918 if (length < TCPOLEN_MD5SIG)
3919 return NULL;
3920
3921 while (length > 0) {
3922 int opcode = *ptr++;
3923 int opsize;
3924
3925 switch (opcode) {
3926 case TCPOPT_EOL:
3927 return NULL;
3928 case TCPOPT_NOP:
3929 length--;
3930 continue;
3931 default:
3932 opsize = *ptr++;
3933 if (opsize < 2 || opsize > length)
3934 return NULL;
3935 if (opcode == TCPOPT_MD5SIG)
3936 return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
3937 }
3938 ptr += opsize - 2;
3939 length -= opsize;
3940 }
3941 return NULL;
3942}
3943EXPORT_SYMBOL(tcp_parse_md5sig_option);
3944#endif
3945
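/* PAWS as strictly specified is awkward for pure ACKs: an ACK that
 * changes no critical state can safely be accepted even with an old
 * timestamp. The predicate below accepts such a segment when it carries
 * no data, duplicates snd_una, cannot update the window and its
 * timestamp is at most about one RTO older than ts_recent.
 */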
3969static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
3970{
3971 const struct tcp_sock *tp = tcp_sk(sk);
3972 const struct tcphdr *th = tcp_hdr(skb);
3973 u32 seq = TCP_SKB_CB(skb)->seq;
3974 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3975
	return (/* 1. Pure ACK with correct sequence number. */
		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&

		/* 2. ... and duplicate ACK. */
		ack == tp->snd_una &&

		/* 3. ... and does not update window. */
		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&

		/* 4. ... and sits in replay window. */
		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
3987}
3988
3989static inline bool tcp_paws_discard(const struct sock *sk,
3990 const struct sk_buff *skb)
3991{
3992 const struct tcp_sock *tp = tcp_sk(sk);
3993
3994 return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
3995 !tcp_disordered_ack(sk, skb);
3996}
3997
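/* Check segment sequence number for validity.
 *
 * Segment controls are considered valid, if the segment
 * fits to the window after truncation to the window. Acceptability
 * of data (and SYN, FIN, of course) is checked separately.
 * See tcp_data_queue(), for example.
 *
 * Also, controls (RST is main one) are accepted using RCV.WUP instead
 * of RCV.NXT. Peer still did not advance his SND.UNA when we
 * delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
 */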
4011static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
4012{
4013 return !before(end_seq, tp->rcv_wup) &&
4014 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
4015}
4016
4017
4018void tcp_reset(struct sock *sk)
4019{
4020
4021 switch (sk->sk_state) {
4022 case TCP_SYN_SENT:
4023 sk->sk_err = ECONNREFUSED;
4024 break;
4025 case TCP_CLOSE_WAIT:
4026 sk->sk_err = EPIPE;
4027 break;
4028 case TCP_CLOSE:
4029 return;
4030 default:
4031 sk->sk_err = ECONNRESET;
4032 }
4033
4034 smp_wmb();
4035
4036 if (!sock_flag(sk, SOCK_DEAD))
4037 sk->sk_error_report(sk);
4038
4039 tcp_done(sk);
4040}
4041
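/* Process the FIN bit. This now behaves as it is supposed to work
 * and the FIN takes effect when it is validly part of sequence
 * space, not before when we get holes.
 *
 * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
 * (and thence onto LAST-ACK and finally, CLOSE; we never enter
 * TIME-WAIT).
 *
 * If we are in FINWAIT-1, a received FIN indicates simultaneous
 * close and we go into CLOSING (and later onto TIME-WAIT).
 *
 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
 */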
4056void tcp_fin(struct sock *sk)
4057{
4058 struct tcp_sock *tp = tcp_sk(sk);
4059
4060 inet_csk_schedule_ack(sk);
4061
4062 sk->sk_shutdown |= RCV_SHUTDOWN;
4063 sock_set_flag(sk, SOCK_DONE);
4064
4065 switch (sk->sk_state) {
4066 case TCP_SYN_RECV:
4067 case TCP_ESTABLISHED:
4068
4069 tcp_set_state(sk, TCP_CLOSE_WAIT);
4070 inet_csk(sk)->icsk_ack.pingpong = 1;
4071 break;
4072
4073 case TCP_CLOSE_WAIT:
4074 case TCP_CLOSING:
4075
4076
4077
4078 break;
4079 case TCP_LAST_ACK:
4080
4081 break;
4082
4083 case TCP_FIN_WAIT1:
4084
4085
4086
4087
4088 tcp_send_ack(sk);
4089 tcp_set_state(sk, TCP_CLOSING);
4090 break;
4091 case TCP_FIN_WAIT2:
4092
4093 tcp_send_ack(sk);
4094 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4095 break;
4096 default:
4097
4098
4099
4100 pr_err("%s: Impossible, sk->sk_state=%d\n",
4101 __func__, sk->sk_state);
4102 break;
4103 }
4104
4105
4106
4107
4108 __skb_queue_purge(&tp->out_of_order_queue);
4109 if (tcp_is_sack(tp))
4110 tcp_sack_reset(&tp->rx_opt);
4111 sk_mem_reclaim(sk);
4112
4113 if (!sock_flag(sk, SOCK_DEAD)) {
4114 sk->sk_state_change(sk);
4115
4116
4117 if (sk->sk_shutdown == SHUTDOWN_MASK ||
4118 sk->sk_state == TCP_CLOSE)
4119 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
4120 else
4121 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
4122 }
4123}
4124
4125static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4126 u32 end_seq)
4127{
4128 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
4129 if (before(seq, sp->start_seq))
4130 sp->start_seq = seq;
4131 if (after(end_seq, sp->end_seq))
4132 sp->end_seq = end_seq;
4133 return true;
4134 }
4135 return false;
4136}
4137
4138static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4139{
4140 struct tcp_sock *tp = tcp_sk(sk);
4141
4142 if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
4143 int mib_idx;
4144
4145 if (before(seq, tp->rcv_nxt))
4146 mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
4147 else
4148 mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
4149
4150 NET_INC_STATS(sock_net(sk), mib_idx);
4151
4152 tp->rx_opt.dsack = 1;
4153 tp->duplicate_sack[0].start_seq = seq;
4154 tp->duplicate_sack[0].end_seq = end_seq;
4155 }
4156}
4157
4158static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4159{
4160 struct tcp_sock *tp = tcp_sk(sk);
4161
4162 if (!tp->rx_opt.dsack)
4163 tcp_dsack_set(sk, seq, end_seq);
4164 else
4165 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
4166}
4167
4168static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4169{
4170 struct tcp_sock *tp = tcp_sk(sk);
4171
4172 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4173 before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4174 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4175 tcp_enter_quickack_mode(sk);
4176
4177 if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
4178 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4179
4180 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
4181 end_seq = tp->rcv_nxt;
4182 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
4183 }
4184 }
4185
4186 tcp_send_ack(sk);
4187}
4188
4189
4190
4191
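/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 */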
4192static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4193{
4194 int this_sack;
4195 struct tcp_sack_block *sp = &tp->selective_acks[0];
4196 struct tcp_sack_block *swalk = sp + 1;
4197
4198
4199
4200
4201 for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
4202 if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
4203 int i;
4204
4205
4206
4207
4208 tp->rx_opt.num_sacks--;
4209 for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
4210 sp[i] = sp[i + 1];
4211 continue;
4212 }
4213 this_sack++, swalk++;
4214 }
4215}
4216
4217static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4218{
4219 struct tcp_sock *tp = tcp_sk(sk);
4220 struct tcp_sack_block *sp = &tp->selective_acks[0];
4221 int cur_sacks = tp->rx_opt.num_sacks;
4222 int this_sack;
4223
4224 if (!cur_sacks)
4225 goto new_sack;
4226
4227 for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
4228 if (tcp_sack_extend(sp, seq, end_seq)) {
4229
4230 for (; this_sack > 0; this_sack--, sp--)
4231 swap(*sp, *(sp - 1));
4232 if (cur_sacks > 1)
4233 tcp_sack_maybe_coalesce(tp);
4234 return;
4235 }
4236 }
4237
4238
4239
4240
4241
4242
4243
4244 if (this_sack >= TCP_NUM_SACKS) {
4245 this_sack--;
4246 tp->rx_opt.num_sacks--;
4247 sp--;
4248 }
4249 for (; this_sack > 0; this_sack--, sp--)
4250 *sp = *(sp - 1);
4251
4252new_sack:
4253
4254 sp->start_seq = seq;
4255 sp->end_seq = end_seq;
4256 tp->rx_opt.num_sacks++;
4257}
4258
4259
4260
4261static void tcp_sack_remove(struct tcp_sock *tp)
4262{
4263 struct tcp_sack_block *sp = &tp->selective_acks[0];
4264 int num_sacks = tp->rx_opt.num_sacks;
4265 int this_sack;
4266
4267
4268 if (skb_queue_empty(&tp->out_of_order_queue)) {
4269 tp->rx_opt.num_sacks = 0;
4270 return;
4271 }
4272
4273 for (this_sack = 0; this_sack < num_sacks;) {
4274
4275 if (!before(tp->rcv_nxt, sp->start_seq)) {
4276 int i;
4277
4278
4279 WARN_ON(before(tp->rcv_nxt, sp->end_seq));
4280
4281
4282 for (i = this_sack+1; i < num_sacks; i++)
4283 tp->selective_acks[i-1] = tp->selective_acks[i];
4284 num_sacks--;
4285 continue;
4286 }
4287 this_sack++;
4288 sp++;
4289 }
4290 tp->rx_opt.num_sacks = num_sacks;
4291}
4292
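/* tcp_try_coalesce - try to merge skb to prior one
 * @sk: socket
 * @to: prior buffer
 * @from: buffer to add in queue
 * @fragstolen: pointer to boolean
 *
 * Before queueing skb @from after @to, try to merge them
 * to reduce overall memory use and queue lengths, if cost is small.
 * Packets in ofo or receive queues can stay a long time.
 * Better try to coalesce them right now to avoid future collapses.
 * Returns true if caller should free @from instead of queueing it.
 */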
4306static bool tcp_try_coalesce(struct sock *sk,
4307 struct sk_buff *to,
4308 struct sk_buff *from,
4309 bool *fragstolen)
4310{
4311 int delta;
4312
4313 *fragstolen = false;
4314
4315
4316 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4317 return false;
4318
4319 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4320 return false;
4321
4322 atomic_add(delta, &sk->sk_rmem_alloc);
4323 sk_mem_charge(sk, delta);
4324 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4325 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4326 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4327 TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
4328 return true;
4329}
4330
4331static void tcp_drop(struct sock *sk, struct sk_buff *skb)
4332{
4333 sk_drops_add(sk, skb);
4334 __kfree_skb(skb);
4335}
4336
4337
4338
4339
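/* This one checks to see if we can put data from the
 * out_of_order queue into the receive_queue.
 */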
4340static void tcp_ofo_queue(struct sock *sk)
4341{
4342 struct tcp_sock *tp = tcp_sk(sk);
4343 __u32 dsack_high = tp->rcv_nxt;
4344 struct sk_buff *skb, *tail;
4345 bool fragstolen, eaten;
4346
4347 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
4348 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4349 break;
4350
4351 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
4352 __u32 dsack = dsack_high;
4353 if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
4354 dsack_high = TCP_SKB_CB(skb)->end_seq;
4355 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4356 }
4357
4358 __skb_unlink(skb, &tp->out_of_order_queue);
4359 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4360 SOCK_DEBUG(sk, "ofo packet was already received\n");
4361 tcp_drop(sk, skb);
4362 continue;
4363 }
4364 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4365 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4366 TCP_SKB_CB(skb)->end_seq);
4367
4368 tail = skb_peek_tail(&sk->sk_receive_queue);
4369 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4370 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4371 if (!eaten)
4372 __skb_queue_tail(&sk->sk_receive_queue, skb);
4373 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4374 tcp_fin(sk);
4375 if (eaten)
4376 kfree_skb_partial(skb, fragstolen);
4377 }
4378}
4379
4380static bool tcp_prune_ofo_queue(struct sock *sk);
4381static int tcp_prune_queue(struct sock *sk);
4382
4383static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4384 unsigned int size)
4385{
4386 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4387 !sk_rmem_schedule(sk, skb, size)) {
4388
4389 if (tcp_prune_queue(sk) < 0)
4390 return -1;
4391
4392 if (!sk_rmem_schedule(sk, skb, size)) {
4393 if (!tcp_prune_ofo_queue(sk))
4394 return -1;
4395
4396 if (!sk_rmem_schedule(sk, skb, size))
4397 return -1;
4398 }
4399 }
4400 return 0;
4401}
4402
4403static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4404{
4405 struct tcp_sock *tp = tcp_sk(sk);
4406 struct sk_buff *skb1;
4407 u32 seq, end_seq;
4408
4409 tcp_ecn_check_ce(tp, skb);
4410
4411 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4412 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
4413 tcp_drop(sk, skb);
4414 return;
4415 }
4416
4417
4418 tp->pred_flags = 0;
4419 inet_csk_schedule_ack(sk);
4420
4421 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4422 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4423 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4424
4425 skb1 = skb_peek_tail(&tp->out_of_order_queue);
4426 if (!skb1) {
4427
4428 if (tcp_is_sack(tp)) {
4429 tp->rx_opt.num_sacks = 1;
4430 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
4431 tp->selective_acks[0].end_seq =
4432 TCP_SKB_CB(skb)->end_seq;
4433 }
4434 __skb_queue_head(&tp->out_of_order_queue, skb);
4435 goto end;
4436 }
4437
4438 seq = TCP_SKB_CB(skb)->seq;
4439 end_seq = TCP_SKB_CB(skb)->end_seq;
4440
4441 if (seq == TCP_SKB_CB(skb1)->end_seq) {
4442 bool fragstolen;
4443
4444 if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
4445 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4446 } else {
4447 tcp_grow_window(sk, skb);
4448 kfree_skb_partial(skb, fragstolen);
4449 skb = NULL;
4450 }
4451
4452 if (!tp->rx_opt.num_sacks ||
4453 tp->selective_acks[0].end_seq != seq)
4454 goto add_sack;
4455
4456
4457 tp->selective_acks[0].end_seq = end_seq;
4458 goto end;
4459 }
4460
4461
4462 while (1) {
4463 if (!after(TCP_SKB_CB(skb1)->seq, seq))
4464 break;
4465 if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
4466 skb1 = NULL;
4467 break;
4468 }
4469 skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
4470 }
4471
4472
4473 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4474 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4475
4476 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4477 tcp_drop(sk, skb);
4478 skb = NULL;
4479 tcp_dsack_set(sk, seq, end_seq);
4480 goto add_sack;
4481 }
4482 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4483
4484 tcp_dsack_set(sk, seq,
4485 TCP_SKB_CB(skb1)->end_seq);
4486 } else {
4487 if (skb_queue_is_first(&tp->out_of_order_queue,
4488 skb1))
4489 skb1 = NULL;
4490 else
4491 skb1 = skb_queue_prev(
4492 &tp->out_of_order_queue,
4493 skb1);
4494 }
4495 }
4496 if (!skb1)
4497 __skb_queue_head(&tp->out_of_order_queue, skb);
4498 else
4499 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4500
4501
4502 while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
4503 skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
4504
4505 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4506 break;
4507 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4508 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4509 end_seq);
4510 break;
4511 }
4512 __skb_unlink(skb1, &tp->out_of_order_queue);
4513 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4514 TCP_SKB_CB(skb1)->end_seq);
4515 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4516 tcp_drop(sk, skb1);
4517 }
4518
4519add_sack:
4520 if (tcp_is_sack(tp))
4521 tcp_sack_new_ofo_skb(sk, seq, end_seq);
4522end:
4523 if (skb) {
4524 tcp_grow_window(sk, skb);
4525 skb_set_owner_r(skb, sk);
4526 }
4527}
4528
4529static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4530 bool *fragstolen)
4531{
4532 int eaten;
4533 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4534
4535 __skb_pull(skb, hdrlen);
4536 eaten = (tail &&
4537 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
4538 tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
4539 if (!eaten) {
4540 __skb_queue_tail(&sk->sk_receive_queue, skb);
4541 skb_set_owner_r(skb, sk);
4542 }
4543 return eaten;
4544}
4545
4546int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4547{
4548 struct sk_buff *skb;
4549 int err = -ENOMEM;
4550 int data_len = 0;
4551 bool fragstolen;
4552
4553 if (size == 0)
4554 return 0;
4555
4556 if (size > PAGE_SIZE) {
4557 int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
4558
4559 data_len = npages << PAGE_SHIFT;
4560 size = data_len + (size & ~PAGE_MASK);
4561 }
4562 skb = alloc_skb_with_frags(size - data_len, data_len,
4563 PAGE_ALLOC_COSTLY_ORDER,
4564 &err, sk->sk_allocation);
4565 if (!skb)
4566 goto err;
4567
4568 skb_put(skb, size - data_len);
4569 skb->data_len = data_len;
4570 skb->len = size;
4571
4572 if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4573 goto err_free;
4574
4575 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
4576 if (err)
4577 goto err_free;
4578
4579 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4580 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4581 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4582
4583 if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4584 WARN_ON_ONCE(fragstolen);
4585 __kfree_skb(skb);
4586 }
4587 return size;
4588
4589err_free:
4590 kfree_skb(skb);
4591err:
4592 return err;
4593
4594}
4595
4596static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4597{
4598 struct tcp_sock *tp = tcp_sk(sk);
4599 bool fragstolen = false;
4600 int eaten = -1;
4601
4602 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
4603 __kfree_skb(skb);
4604 return;
4605 }
4606 skb_dst_drop(skb);
4607 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
4608
4609 tcp_ecn_accept_cwr(tp, skb);
4610
4611 tp->rx_opt.dsack = 0;
4612
4613
4614
4615
4616
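	/* Queue data for delivery to the user.
	 * Packets in sequence go to the receive queue.
	 * Out of sequence packets to the out_of_order_queue.
	 */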
4617 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
4618 if (tcp_receive_window(tp) == 0)
4619 goto out_of_window;
4620
4621
4622 if (tp->ucopy.task == current &&
4623 tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
4624 sock_owned_by_user(sk) && !tp->urg_data) {
4625 int chunk = min_t(unsigned int, skb->len,
4626 tp->ucopy.len);
4627
4628 __set_current_state(TASK_RUNNING);
4629
4630 if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
4631 tp->ucopy.len -= chunk;
4632 tp->copied_seq += chunk;
4633 eaten = (chunk == skb->len);
4634 tcp_rcv_space_adjust(sk);
4635 }
4636 }
4637
4638 if (eaten <= 0) {
4639queue_and_out:
4640 if (eaten < 0) {
4641 if (skb_queue_len(&sk->sk_receive_queue) == 0)
4642 sk_forced_mem_schedule(sk, skb->truesize);
4643 else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4644 goto drop;
4645 }
4646 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4647 }
4648 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4649 if (skb->len)
4650 tcp_event_data_recv(sk, skb);
4651 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4652 tcp_fin(sk);
4653
4654 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4655 tcp_ofo_queue(sk);
4656
4657
4658
4659
4660 if (skb_queue_empty(&tp->out_of_order_queue))
4661 inet_csk(sk)->icsk_ack.pingpong = 0;
4662 }
4663
4664 if (tp->rx_opt.num_sacks)
4665 tcp_sack_remove(tp);
4666
4667 tcp_fast_path_check(sk);
4668
4669 if (eaten > 0)
4670 kfree_skb_partial(skb, fragstolen);
4671 if (!sock_flag(sk, SOCK_DEAD))
4672 sk->sk_data_ready(sk);
4673 return;
4674 }
4675
4676 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4677
4678 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4679 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4680
4681out_of_window:
4682 tcp_enter_quickack_mode(sk);
4683 inet_csk_schedule_ack(sk);
4684drop:
4685 tcp_drop(sk, skb);
4686 return;
4687 }
4688
4689
4690 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
4691 goto out_of_window;
4692
4693 tcp_enter_quickack_mode(sk);
4694
4695 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4696
4697 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
4698 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4699 TCP_SKB_CB(skb)->end_seq);
4700
4701 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
4702
4703
4704
4705
4706 if (!tcp_receive_window(tp))
4707 goto out_of_window;
4708 goto queue_and_out;
4709 }
4710
4711 tcp_data_queue_ofo(sk, skb);
4712}
4713
4714static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4715 struct sk_buff_head *list)
4716{
4717 struct sk_buff *next = NULL;
4718
4719 if (!skb_queue_is_last(list, skb))
4720 next = skb_queue_next(list, skb);
4721
4722 __skb_unlink(skb, list);
4723 __kfree_skb(skb);
4724 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4725
4726 return next;
4727}
4728
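/* Collapse contiguous sequence of skbs head..tail with
 * sequence numbers start..end.
 *
 * If tail is NULL, this means until the end of the list.
 *
 * Segments with FIN/SYN are not collapsed (only because this
 * simplifies code).
 */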
4737static void
4738tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4739 struct sk_buff *head, struct sk_buff *tail,
4740 u32 start, u32 end)
4741{
4742 struct sk_buff *skb, *n;
4743 bool end_of_skbs;
4744
4745
4746
4747 skb = head;
4748restart:
4749 end_of_skbs = true;
4750 skb_queue_walk_from_safe(list, skb, n) {
4751 if (skb == tail)
4752 break;
4753
4754 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4755 skb = tcp_collapse_one(sk, skb, list);
4756 if (!skb)
4757 break;
4758 goto restart;
4759 }
4760
4761
4762
4763
4764
4765
4766 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
4767 (tcp_win_from_space(skb->truesize) > skb->len ||
4768 before(TCP_SKB_CB(skb)->seq, start))) {
4769 end_of_skbs = false;
4770 break;
4771 }
4772
4773 if (!skb_queue_is_last(list, skb)) {
4774 struct sk_buff *next = skb_queue_next(list, skb);
4775 if (next != tail &&
4776 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
4777 end_of_skbs = false;
4778 break;
4779 }
4780 }
4781
4782
4783 start = TCP_SKB_CB(skb)->end_seq;
4784 }
4785 if (end_of_skbs ||
4786 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4787 return;
4788
4789 while (before(start, end)) {
4790 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
4791 struct sk_buff *nskb;
4792
4793 nskb = alloc_skb(copy, GFP_ATOMIC);
4794 if (!nskb)
4795 return;
4796
4797 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4798 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4799 __skb_queue_before(list, skb, nskb);
4800 skb_set_owner_r(nskb, sk);
4801
4802
4803 while (copy > 0) {
4804 int offset = start - TCP_SKB_CB(skb)->seq;
4805 int size = TCP_SKB_CB(skb)->end_seq - start;
4806
4807 BUG_ON(offset < 0);
4808 if (size > 0) {
4809 size = min(copy, size);
4810 if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
4811 BUG();
4812 TCP_SKB_CB(nskb)->end_seq += size;
4813 copy -= size;
4814 start += size;
4815 }
4816 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4817 skb = tcp_collapse_one(sk, skb, list);
4818 if (!skb ||
4819 skb == tail ||
4820 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4821 return;
4822 }
4823 }
4824 }
4825}
4826
4827
4828
4829
4830static void tcp_collapse_ofo_queue(struct sock *sk)
4831{
4832 struct tcp_sock *tp = tcp_sk(sk);
4833 struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
4834 struct sk_buff *head;
4835 u32 start, end;
4836
4837 if (!skb)
4838 return;
4839
4840 start = TCP_SKB_CB(skb)->seq;
4841 end = TCP_SKB_CB(skb)->end_seq;
4842 head = skb;
4843
4844 for (;;) {
4845 struct sk_buff *next = NULL;
4846
4847 if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
4848 next = skb_queue_next(&tp->out_of_order_queue, skb);
4849 skb = next;
4850
4851
4852
4853 if (!skb ||
4854 after(TCP_SKB_CB(skb)->seq, end) ||
4855 before(TCP_SKB_CB(skb)->end_seq, start)) {
4856 tcp_collapse(sk, &tp->out_of_order_queue,
4857 head, skb, start, end);
4858 head = skb;
4859 if (!skb)
4860 break;
4861
4862 start = TCP_SKB_CB(skb)->seq;
4863 end = TCP_SKB_CB(skb)->end_seq;
4864 } else {
4865 if (before(TCP_SKB_CB(skb)->seq, start))
4866 start = TCP_SKB_CB(skb)->seq;
4867 if (after(TCP_SKB_CB(skb)->end_seq, end))
4868 end = TCP_SKB_CB(skb)->end_seq;
4869 }
4870 }
4871}
4872
4873
4874
4875
4876
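/* Purge the out-of-order queue.
 * Return true if queue was pruned.
 */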
4877static bool tcp_prune_ofo_queue(struct sock *sk)
4878{
4879 struct tcp_sock *tp = tcp_sk(sk);
4880 bool res = false;
4881
4882 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4883 NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
4884 __skb_queue_purge(&tp->out_of_order_queue);
4885
4886
4887
4888
4889
4890
4891 if (tp->rx_opt.sack_ok)
4892 tcp_sack_reset(&tp->rx_opt);
4893 sk_mem_reclaim(sk);
4894 res = true;
4895 }
4896 return res;
4897}
4898
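/* Reduce allocated memory if we can, trying to get
 * the socket within its memory limits again.
 *
 * Return less than zero if we should start dropping frames
 * until the socket owning process reads some of the data
 * to stabilize the situation.
 */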
4906static int tcp_prune_queue(struct sock *sk)
4907{
4908 struct tcp_sock *tp = tcp_sk(sk);
4909
4910 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
4911
4912 NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
4913
4914 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
4915 tcp_clamp_window(sk);
4916 else if (tcp_under_memory_pressure(sk))
4917 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
4918
4919 tcp_collapse_ofo_queue(sk);
4920 if (!skb_queue_empty(&sk->sk_receive_queue))
4921 tcp_collapse(sk, &sk->sk_receive_queue,
4922 skb_peek(&sk->sk_receive_queue),
4923 NULL,
4924 tp->copied_seq, tp->rcv_nxt);
4925 sk_mem_reclaim(sk);
4926
4927 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
4928 return 0;
4929
4930
4931
4932
4933 tcp_prune_ofo_queue(sk);
4934
4935 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
4936 return 0;
4937
4938
4939
4940
4941
4942 NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
4943
4944
4945 tp->pred_flags = 0;
4946 return -1;
4947}
4948
4949static bool tcp_should_expand_sndbuf(const struct sock *sk)
4950{
4951 const struct tcp_sock *tp = tcp_sk(sk);
4952
4953
4954
4955
4956 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
4957 return false;
4958
4959
4960 if (tcp_under_memory_pressure(sk))
4961 return false;
4962
4963
4964 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
4965 return false;
4966
4967
4968 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
4969 return false;
4970
4971 return true;
4972}
4973
4974
4975
4976
4977
4978
4979
4980static void tcp_new_space(struct sock *sk)
4981{
4982 struct tcp_sock *tp = tcp_sk(sk);
4983
4984 if (tcp_should_expand_sndbuf(sk)) {
4985 tcp_sndbuf_expand(sk);
4986 tp->snd_cwnd_stamp = tcp_time_stamp;
4987 }
4988
4989 sk->sk_write_space(sk);
4990}
4991
4992static void tcp_check_space(struct sock *sk)
4993{
4994 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
4995 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4996
4997 smp_mb__after_atomic();
4998 if (sk->sk_socket &&
4999 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5000 tcp_new_space(sk);
5001 }
5002}
5003
5004static inline void tcp_data_snd_check(struct sock *sk)
5005{
5006 tcp_push_pending_frames(sk);
5007 tcp_check_space(sk);
5008}
5009
5010
5011
5012
5013static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
5014{
5015 struct tcp_sock *tp = tcp_sk(sk);
5016
	    /* More than one full frame received... */
	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
	     /* ... and right edge of window advances far enough.
	      * (tcp_recvmsg() will send ACK otherwise). Or...
	      */
	     __tcp_select_window(sk) >= tp->rcv_wnd) ||
	    /* We ACK each frame or... */
	    tcp_in_quickack_mode(sk) ||
	    /* We have out of order data. */
	    (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
		/* Then ack it now */
		tcp_send_ack(sk);
	} else {
		/* Else, send delayed ack. */
		tcp_send_delayed_ack(sk);
	}
5033}
5034
5035static inline void tcp_ack_snd_check(struct sock *sk)
5036{
5037 if (!inet_csk_ack_scheduled(sk)) {
5038
5039 return;
5040 }
5041 __tcp_ack_snd_check(sk, 1);
5042}
5043
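/* Handle the urgent-pointer field of an incoming segment: interpret the
 * pointer according to sysctl_tcp_stdurg (RFC 1122 vs. BSD semantics),
 * send SIGURG to the socket owner and record where the urgent byte is.
 */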
5054static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
5055{
5056 struct tcp_sock *tp = tcp_sk(sk);
5057 u32 ptr = ntohs(th->urg_ptr);
5058
5059 if (ptr && !sysctl_tcp_stdurg)
5060 ptr--;
5061 ptr += ntohl(th->seq);
5062
5063
5064 if (after(tp->copied_seq, ptr))
5065 return;
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077 if (before(ptr, tp->rcv_nxt))
5078 return;
5079
5080
5081 if (tp->urg_data && !after(ptr, tp->urg_seq))
5082 return;
5083
5084
5085 sk_send_sigurg(sk);
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
5103 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
5104 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
5105 tp->copied_seq++;
5106 if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
5107 __skb_unlink(skb, &sk->sk_receive_queue);
5108 __kfree_skb(skb);
5109 }
5110 }
5111
5112 tp->urg_data = TCP_URG_NOTYET;
5113 tp->urg_seq = ptr;
5114
5115
5116 tp->pred_flags = 0;
5117}
5118
5119
5120static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
5121{
5122 struct tcp_sock *tp = tcp_sk(sk);
5123
5124
5125 if (th->urg)
5126 tcp_check_urg(sk, th);
5127
5128
5129 if (tp->urg_data == TCP_URG_NOTYET) {
5130 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
5131 th->syn;
5132
5133
5134 if (ptr < skb->len) {
5135 u8 tmp;
5136 if (skb_copy_bits(skb, ptr, &tmp, 1))
5137 BUG();
5138 tp->urg_data = TCP_URG_VALID | tmp;
5139 if (!sock_flag(sk, SOCK_DEAD))
5140 sk->sk_data_ready(sk);
5141 }
5142 }
5143}
5144
5145static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
5146{
5147 struct tcp_sock *tp = tcp_sk(sk);
5148 int chunk = skb->len - hlen;
5149 int err;
5150
5151 if (skb_csum_unnecessary(skb))
5152 err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
5153 else
5154 err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
5155
5156 if (!err) {
5157 tp->ucopy.len -= chunk;
5158 tp->copied_seq += chunk;
5159 tcp_rcv_space_adjust(sk);
5160 }
5161
5162 return err;
5163}
5164
5165
5166
5167
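/* Does PAWS and seqno based validation of an incoming segment, flags will
 * play significant role here.
 */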
5168static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5169 const struct tcphdr *th, int syn_inerr)
5170{
5171 struct tcp_sock *tp = tcp_sk(sk);
5172
5173
5174 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
5175 tcp_paws_discard(sk, skb)) {
5176 if (!th->rst) {
5177 NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
5178 if (!tcp_oow_rate_limited(sock_net(sk), skb,
5179 LINUX_MIB_TCPACKSKIPPEDPAWS,
5180 &tp->last_oow_ack_time))
5181 tcp_send_dupack(sk, skb);
5182 goto discard;
5183 }
5184
5185 }
5186
5187
5188 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
5189
5190
5191
5192
5193
5194
5195 if (!th->rst) {
5196 if (th->syn)
5197 goto syn_challenge;
5198 if (!tcp_oow_rate_limited(sock_net(sk), skb,
5199 LINUX_MIB_TCPACKSKIPPEDSEQ,
5200 &tp->last_oow_ack_time))
5201 tcp_send_dupack(sk, skb);
5202 }
5203 goto discard;
5204 }
5205
5206
5207 if (th->rst) {
5208
5209
5210
5211
5212
5213
5214 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
5215 tcp_reset(sk);
5216 else
5217 tcp_send_challenge_ack(sk, skb);
5218 goto discard;
5219 }
5220
5221
5222
5223
5224
5225
5226 if (th->syn) {
5227syn_challenge:
5228 if (syn_inerr)
5229 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
5230 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5231 tcp_send_challenge_ack(sk, skb);
5232 goto discard;
5233 }
5234
5235 return true;
5236
5237discard:
5238 tcp_drop(sk, skb);
5239 return false;
5240}
5241
/*
 *	TCP receive function for the ESTABLISHED state.
 *
 *	It is split into a fast path and a slow path. The fast path is
 *	disabled when:
 *	- A zero window was announced from us - zero window probing
 *	  is only handled properly in the slow path.
 *	- Out of order segments arrived.
 *	- Urgent data is expected.
 *	- There is no buffer space left
 *	- Unexpected TCP flags/window values/header lengths are received
 *	  (detected by checking the TCP header against pred_flags)
 *	- Data is sent in both directions. Fast path only supports pure senders
 *	  or pure receivers (this means either the sequence number or the ack
 *	  value must stay constant)
 *	- Unexpected TCP option.
 *
 *	When these conditions are not satisfied it drops into a standard
 *	receive procedure patterned after RFC793 to handle all cases.
 *	The first three cases are guaranteed by proper pred_flags setting,
 *	the rest is checked inline. Fast processing is turned on in
 *	tcp_data_queue when everything is OK.
 */
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
			 const struct tcphdr *th, unsigned int len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unlikely(!sk->sk_rx_dst))
		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);

	/*
	 *	Header prediction.
	 *	The code loosely follows the one in the famous
	 *	"30 instruction TCP receive" Van Jacobson mail.
	 *
	 *	Van's trick is to deposit buffers into socket queue
	 *	on a device interrupt, to call tcp_recv function
	 *	on the receive process context and checksum and copy
	 *	the buffer to user space. smart...
	 *
	 *	Our current scheme is not silly either but we take the
	 *	extra cost of the net_bh soft interrupt processing...
	 *	We do checksum and copy also but from device to kernel.
	 */

	tp->rx_opt.saw_tstamp = 0;

	/*	pred_flags is 0xS?10 << 16 + snd_wnd
	 *	if header_prediction is to be made
	 *	'S' will always be tp->tcp_header_len >> 2
	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
	 *	turn it off	(when there are holes in the receive
	 *	 space for instance)
	 *	PSH flag is ignored.
	 */
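	/* Worked example: with timestamps negotiated tcp_header_len is 32,
	 * so 'S' is 32 >> 2 == 8; an ACK-only segment predicted against
	 * snd_wnd == 0x1234 must then match pred_flags == htonl(0x80101234),
	 * the word built by __tcp_fast_path_on().
	 */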
	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
		int tcp_header_len = tp->tcp_header_len;

		/* Timestamp header prediction: tcp_header_len
		 * is automatically equal to th->doff*4 due to pred_flags
		 * match.
		 */

		/* Check timestamp */
		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
			/* No? Slow path! */
			if (!tcp_parse_aligned_timestamp(tp, th))
				goto slow_path;

			/* If PAWS failed, check it more carefully in slow path */
			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
				goto slow_path;

			/* DO NOT update ts_recent here, if checksum fails
			 * and timestamp was corrupted part, it will result
			 * in a hung connection since we will drop all
			 * future packets due to the PAWS test.
			 */
		}

		if (len <= tcp_header_len) {
			/* Bulk data transfer: sender */
			if (len == tcp_header_len) {
				/* Predicted packet is in window by definition.
				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
				 * Hence, check seq<=rcv_wup reduces to:
				 */
				if (tcp_header_len ==
				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				    tp->rcv_nxt == tp->rcv_wup)
					tcp_store_ts_recent(tp);

				/* We know that such packets are checksummed
				 * on entry.
				 */
				tcp_ack(sk, skb, 0);
				__kfree_skb(skb);
				tcp_data_snd_check(sk);
				return;
			} else { /* Header too small */
				TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
				goto discard;
			}
		} else {
			int eaten = 0;
			bool fragstolen = false;

			if (tp->ucopy.task == current &&
			    tp->copied_seq == tp->rcv_nxt &&
			    len - tcp_header_len <= tp->ucopy.len &&
			    sock_owned_by_user(sk)) {
				__set_current_state(TASK_RUNNING);

				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
					/* Predicted packet is in window by definition.
					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
					 * Hence, check seq<=rcv_wup reduces to:
					 */
					if (tcp_header_len ==
					    (sizeof(struct tcphdr) +
					     TCPOLEN_TSTAMP_ALIGNED) &&
					    tp->rcv_nxt == tp->rcv_wup)
						tcp_store_ts_recent(tp);

					tcp_rcv_rtt_measure_ts(sk, skb);

					__skb_pull(skb, tcp_header_len);
					tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
					NET_INC_STATS(sock_net(sk),
						      LINUX_MIB_TCPHPHITSTOUSER);
					eaten = 1;
				}
			}
			if (!eaten) {
				if (tcp_checksum_complete(skb))
					goto csum_error;

				if ((int)skb->truesize > sk->sk_forward_alloc)
					goto step5;

				/* Predicted packet is in window by definition.
				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
				 * Hence, check seq<=rcv_wup reduces to:
				 */
				if (tcp_header_len ==
				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
				    tp->rcv_nxt == tp->rcv_wup)
					tcp_store_ts_recent(tp);

				tcp_rcv_rtt_measure_ts(sk, skb);

				NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);

				/* Bulk data transfer: receiver */
				eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
						      &fragstolen);
			}

			tcp_event_data_recv(sk, skb);

			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
				/* Well, only one small jumplet in fast path... */
				tcp_ack(sk, skb, FLAG_DATA);
				tcp_data_snd_check(sk);
				if (!inet_csk_ack_scheduled(sk))
					goto no_ack;
			}

			__tcp_ack_snd_check(sk, 0);
no_ack:
			if (eaten)
				kfree_skb_partial(skb, fragstolen);
			sk->sk_data_ready(sk);
			return;
		}
	}

slow_path:
	if (len < (th->doff << 2) || tcp_checksum_complete(skb))
		goto csum_error;

	if (!th->ack && !th->rst && !th->syn)
		goto discard;

	/*
	 *	Standard slow path.
	 */
	if (!tcp_validate_incoming(sk, skb, th, 1))
		return;

step5:
	if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
		goto discard;

	tcp_rcv_rtt_measure_ts(sk, skb);

	/* Process urgent data. */
	tcp_urg(sk, skb, th);

	/* step 7: process the segment text */
	tcp_data_queue(sk, skb);

	tcp_data_snd_check(sk);
	tcp_ack_snd_check(sk);
	return;

csum_error:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);

discard:
	tcp_drop(sk, skb);
}
EXPORT_SYMBOL(tcp_rcv_established);

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_set_state(sk, TCP_ESTABLISHED);

	if (skb) {
		icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
		security_inet_conn_established(sk, skb);
	}

	/* Make sure socket is routed, for correct metrics. */
	icsk->icsk_af_ops->rebuild_header(sk);

	tcp_init_metrics(sk);

	tcp_init_congestion_control(sk);

	/* Prevent spurious tcp_cwnd_restart() on first data
	 * packet.
	 */
	tp->lsndtime = tcp_time_stamp;

	tcp_init_buffer_space(sk);

	if (sock_flag(sk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));

	if (!tp->rx_opt.snd_wscale)
		__tcp_fast_path_on(tp, tp->snd_wnd);
	else
		tp->pred_flags = 0;

	if (!sock_flag(sk, SOCK_DEAD)) {
		sk->sk_state_change(sk);
		sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
	}
}

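/* Complete an active TCP Fast Open: cache the SYN-ACK MSS and cookie for
 * future connects and, if our SYN-data went unacknowledged, retransmit
 * it immediately. Returns true if the Fast Open attempt failed and data
 * had to be retransmitted.
 */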
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
				    struct tcp_fastopen_cookie *cookie)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
	u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
	bool syn_drop = false;

	if (mss == tp->rx_opt.user_mss) {
		struct tcp_options_received opt;

		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
		tcp_clear_options(&opt);
		opt.user_mss = opt.mss_clamp = 0;
		tcp_parse_options(synack, &opt, 0, NULL);
		mss = opt.mss_clamp;
	}

	if (!tp->syn_fastopen) {
		/* Ignore an unsolicited cookie */
		cookie->len = -1;
	} else if (tp->total_retrans) {
		/* SYN timed out and the SYN-ACK neither has a cookie nor
		 * acknowledges data. Presumably the remote received only
		 * the retransmitted (regular) SYNs: either the original
		 * SYN-data or the corresponding SYN-ACK was dropped.
		 */
		syn_drop = (cookie->len < 0 && data);
	} else if (cookie->len < 0 && !tp->syn_data) {
		/* We requested a cookie but didn't get it. If we did not use
		 * the (old) exp opt format then try so next time (try_exp=1).
		 * Otherwise we go back to use the RFC7413 opt (try_exp=2).
		 */
		try_exp = tp->syn_fastopen_exp ? 2 : 1;
	}

	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);

	if (data) { /* Retransmit unacked data in SYN */
		tcp_for_write_queue_from(data, sk) {
			if (data == tcp_send_head(sk) ||
			    __tcp_retransmit_skb(sk, data, 1))
				break;
		}
		tcp_rearm_rto(sk);
		NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPFASTOPENACTIVEFAIL);
		return true;
	}
	tp->syn_data_acked = tp->syn_data;
	if (tp->syn_data_acked)
		NET_INC_STATS(sock_net(sk),
				LINUX_MIB_TCPFASTOPENACTIVE);

	tcp_fastopen_add_skb(sk, synack);

	return false;
}

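/* Process a segment received in SYN_SENT state, implementing the
 * SYN-SENT arm of RFC 793's "SEGMENT ARRIVES" processing (including
 * simultaneous open). Returns 0 if the skb was consumed, 1 if the
 * caller must send a reset, and -1 if the caller should continue with
 * the remaining processing steps itself.
 */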
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
					 const struct tcphdr *th)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_fastopen_cookie foc = { .len = -1 };
	int saved_clamp = tp->rx_opt.mss_clamp;

	tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
		tp->rx_opt.rcv_tsecr -= tp->tsoffset;

	if (th->ack) {
		/* rfc793:
		 * "If the state is SYN-SENT then
		 *    first check the ACK bit
		 *      If the ACK bit is set
		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
		 *        a reset (unless the RST bit is set, if so drop
		 *        the segment and return)"
		 */
		if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
		    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
			goto reset_and_undo;

		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
			     tcp_time_stamp)) {
			NET_INC_STATS(sock_net(sk),
					LINUX_MIB_PAWSACTIVEREJECTED);
			goto reset_and_undo;
		}

		/* Now ACK is acceptable.
		 *
		 * "If the RST bit is set
		 *    If the ACK was acceptable then signal the user "error:
		 *    connection reset", drop the segment, enter CLOSED state,
		 *    delete TCB, and return."
		 */

		if (th->rst) {
			tcp_reset(sk);
			goto discard;
		}

		/* rfc793:
		 *   "fifth, if neither of the SYN or RST bits is set then
		 *    drop the segment and return."
		 *
		 *    See note below!
		 *                                        --ANK(990513)
		 */
		if (!th->syn)
			goto discard_and_undo;

		/* rfc793:
		 *   "If the SYN bit is on ...
		 *    are acceptable then ...
		 *    (our SYN has been ACKed), change the connection
		 *    state to ESTABLISHED..."
		 */
		tcp_ecn_rcv_synack(tp, th);

		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
		tcp_ack(sk, skb, FLAG_SLOWPATH);

		/* Ok.. it's good. Set up sequence numbers and
		 * move to established.
		 */
		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

		/* RFC1323: The window in SYN & SYN/ACK segments is
		 * never scaled.
		 */
		tp->snd_wnd = ntohs(th->window);

		if (!tp->rx_opt.wscale_ok) {
			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
			tp->window_clamp = min(tp->window_clamp, 65535U);
		}

		if (tp->rx_opt.saw_tstamp) {
			tp->rx_opt.tstamp_ok = 1;
			tp->tcp_header_len =
				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
			tcp_store_ts_recent(tp);
		} else {
			tp->tcp_header_len = sizeof(struct tcphdr);
		}

		if (tcp_is_sack(tp) && sysctl_tcp_fack)
			tcp_enable_fack(tp);

		tcp_mtup_init(sk);
		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
		tcp_initialize_rcv_mss(sk);

		/* Remember, tcp_poll() does not lock socket!
		 * Change state from SYN-SENT only after copied_seq
		 * is initialized. */
		tp->copied_seq = tp->rcv_nxt;

		smp_mb();

		tcp_finish_connect(sk, skb);

		if ((tp->syn_fastopen || tp->syn_data) &&
		    tcp_rcv_fastopen_synack(sk, skb, &foc))
			return -1;

		if (sk->sk_write_pending ||
		    icsk->icsk_accept_queue.rskq_defer_accept ||
		    icsk->icsk_ack.pingpong) {
			/* Save one ACK. Data will be ready after
			 * several ticks, if write_pending is set.
			 *
			 * It may be deleted, but with this feature tcpdumps
			 * look so _wonderfully_ clever, that I was not able
			 * to stand against the temptation 8)     --ANK
			 */
			inet_csk_schedule_ack(sk);
			icsk->icsk_ack.lrcvtime = tcp_time_stamp;
			tcp_enter_quickack_mode(sk);
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
						  TCP_DELACK_MAX, TCP_RTO_MAX);

discard:
			tcp_drop(sk, skb);
			return 0;
		} else {
			tcp_send_ack(sk);
		}
		return -1;
	}

	/* No ACK in the segment */

	if (th->rst) {
		/* rfc793:
		 * "If the RST bit is set
		 *
		 *      Otherwise (no ACK) drop the segment and return."
		 */
		goto discard_and_undo;
	}

	/* PAWS check. */
	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
	    tcp_paws_reject(&tp->rx_opt, 0))
		goto discard_and_undo;

	if (th->syn) {
		/* We see SYN without ACK. It is attempt of
		 * simultaneous connect with crossed SYNs.
		 * Particularly, it can be connect to self.
		 */
		tcp_set_state(sk, TCP_SYN_RECV);

		if (tp->rx_opt.saw_tstamp) {
			tp->rx_opt.tstamp_ok = 1;
			tcp_store_ts_recent(tp);
			tp->tcp_header_len =
				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
		} else {
			tp->tcp_header_len = sizeof(struct tcphdr);
		}

		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
		tp->copied_seq = tp->rcv_nxt;
		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

		/* RFC1323: The window in SYN & SYN/ACK segments is
		 * never scaled.
		 */
		tp->snd_wnd = ntohs(th->window);
		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
		tp->max_window = tp->snd_wnd;

		tcp_ecn_rcv_syn(tp, th);

		tcp_mtup_init(sk);
		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
		tcp_initialize_rcv_mss(sk);

		tcp_send_synack(sk);
#if 0
		/* Note, we could accept data and URG from this segment.
		 * There are no obstacles to make this (except that we must
		 * either change tcp_recvmsg() to prevent it from returning data
		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
		 *
		 * However, if we ignore data in ACKless segments sometimes,
		 * we have no reasons to accept it sometimes.
		 * Also, seems the code doing it in step6 of tcp_rcv_state_process
		 * is not flawless. So, discard packet for sanity.
		 * Uncomment this return to process the data.
		 */
		return -1;
#else
		goto discard;
#endif
	}
	/* "fifth, if neither of the SYN or RST bits is set then
	 * drop the segment and return."
	 */

discard_and_undo:
	tcp_clear_options(&tp->rx_opt);
	tp->rx_opt.mss_clamp = saved_clamp;
	goto discard;

reset_and_undo:
	tcp_clear_options(&tp->rx_opt);
	tp->rx_opt.mss_clamp = saved_clamp;
	return 1;
}

/*
 *	This function implements the receiving procedure of RFC 793 for
 *	all states except ESTABLISHED and TIME_WAIT.
 *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 *	address independent.
 */
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcphdr *th = tcp_hdr(skb);
	struct request_sock *req;
	int queued = 0;
	bool acceptable;

	switch (sk->sk_state) {
	case TCP_CLOSE:
		goto discard;

	case TCP_LISTEN:
		if (th->ack)
			return 1;

		if (th->rst)
			goto discard;

		if (th->syn) {
			if (th->fin)
				goto discard;
			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
				return 1;

			/* Any data carried on the SYN is deliberately not
			 * queued here (security over speed); conn_request()
			 * has already created the request sock.
			 */
			consume_skb(skb);
			return 0;
		}
		goto discard;

	case TCP_SYN_SENT:
		tp->rx_opt.saw_tstamp = 0;
		queued = tcp_rcv_synsent_state_process(sk, skb, th);
		if (queued >= 0)
			return queued;

		/* Do step6 onward by hand. */
		tcp_urg(sk, skb, th);
		__kfree_skb(skb);
		tcp_data_snd_check(sk);
		return 0;
	}

	tp->rx_opt.saw_tstamp = 0;
	req = tp->fastopen_rsk;
	if (req) {
		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
		    sk->sk_state != TCP_FIN_WAIT1);

		if (!tcp_check_req(sk, skb, req, true))
			goto discard;
	}

	if (!th->ack && !th->rst && !th->syn)
		goto discard;

	if (!tcp_validate_incoming(sk, skb, th, 0))
		return 0;

	/* step 5: check the ACK field */
	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
				      FLAG_UPDATE_TS_RECENT) > 0;

	switch (sk->sk_state) {
	case TCP_SYN_RECV:
		if (!acceptable)
			return 1;

		if (!tp->srtt_us)
			tcp_synack_rtt_meas(sk, req);

		/* Once we leave TCP_SYN_RECV, we no longer need req
		 * so release it.
		 */
		if (req) {
			tp->total_retrans = req->num_retrans;
			reqsk_fastopen_remove(sk, req, false);
		} else {
			/* Make sure socket is routed, for correct metrics. */
			icsk->icsk_af_ops->rebuild_header(sk);
			tcp_init_congestion_control(sk);

			tcp_mtup_init(sk);
			tp->copied_seq = tp->rcv_nxt;
			tcp_init_buffer_space(sk);
		}
		smp_mb();
		tcp_set_state(sk, TCP_ESTABLISHED);
		sk->sk_state_change(sk);

		/* Note, that this wakeup is only for marginal crossed SYN
		 * case. Passively open sockets are not waked up, because
		 * sk->sk_socket is NULL.
		 */
		if (sk->sk_socket)
			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

		tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

		if (tp->rx_opt.tstamp_ok)
			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

		if (req) {
			/* Re-arm the timer because data may have been sent out.
			 * This is similar to the regular data transmission case
			 * when new data has just been ack'ed.
			 *
			 * (TFO) - we could try to be more aggressive and
			 * retransmit data sooner based on when it was sent.
			 */
			tcp_rearm_rto(sk);
		} else
			tcp_init_metrics(sk);

		tcp_update_pacing_rate(sk);

		/* Prevent spurious tcp_cwnd_restart() on first data packet */
		tp->lsndtime = tcp_time_stamp;

		tcp_initialize_rcv_mss(sk);
		tcp_fast_path_on(tp);
		break;

	case TCP_FIN_WAIT1: {
		struct dst_entry *dst;
		int tmo;

		/* A TFO passive socket may reach FIN_WAIT1 with the request
		 * sock still attached: close() can move the socket here
		 * before the handshake ACK arrives, so validate the ACK and
		 * release the request sock first.
		 */
		if (req) {
			/* Return RST if ack_seq is invalid.
			 * Note that RFC793 only says to generate a
			 * DUPACK for it but for TCP Fast Open it seems
			 * better to treat this case like TCP_SYN_RECV
			 * above.
			 */
			if (!acceptable)
				return 1;

			reqsk_fastopen_remove(sk, req, false);
			tcp_rearm_rto(sk);
		}
		if (tp->snd_una != tp->write_seq)
			break;

		tcp_set_state(sk, TCP_FIN_WAIT2);
		sk->sk_shutdown |= SEND_SHUTDOWN;

		dst = __sk_dst_get(sk);
		if (dst)
			dst_confirm(dst);

		if (!sock_flag(sk, SOCK_DEAD)) {
			/* Wake up lingering close() */
			sk->sk_state_change(sk);
			break;
		}

		if (tp->linger2 < 0 ||
		    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
		     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
			tcp_done(sk);
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
			return 1;
		}

		tmo = tcp_fin_time(sk);
		if (tmo > TCP_TIMEWAIT_LEN) {
			inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
		} else if (th->fin || sock_owned_by_user(sk)) {
			/* Bad case. We could lose such FIN otherwise.
			 * It is not a big problem, but it looks confusing
			 * and not so rare event. We still can lose it now,
			 * if it spins in bh_lock_sock(), but it is really
			 * marginal case.
			 */
			inet_csk_reset_keepalive_timer(sk, tmo);
		} else {
			tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
			goto discard;
		}
		break;
	}

	case TCP_CLOSING:
		if (tp->snd_una == tp->write_seq) {
			tcp_time_wait(sk, TCP_TIME_WAIT, 0);
			goto discard;
		}
		break;

	case TCP_LAST_ACK:
		if (tp->snd_una == tp->write_seq) {
			tcp_update_metrics(sk);
			tcp_done(sk);
			goto discard;
		}
		break;
	}

	/* step 6: check the URG bit */
	tcp_urg(sk, skb, th);

	/* step 7: process the segment text */
	switch (sk->sk_state) {
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
	case TCP_LAST_ACK:
		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
			break;
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
		/* RFC 793 says to queue data in these states,
		 * RFC 1122 says we MUST send a reset.
		 * BSD 4.4 also does reset.
		 */
		if (sk->sk_shutdown & RCV_SHUTDOWN) {
			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
				NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
				tcp_reset(sk);
				return 1;
			}
		}
		/* Fall through */
	case TCP_ESTABLISHED:
		tcp_data_queue(sk, skb);
		queued = 1;
		break;
	}

	/* tcp_data could move socket to TIME-WAIT */
	if (sk->sk_state != TCP_CLOSE) {
		tcp_data_snd_check(sk);
		tcp_ack_snd_check(sk);
	}

	if (!queued) {
discard:
		tcp_drop(sk, skb);
	}
	return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);

static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	if (family == AF_INET)
		net_dbg_ratelimited("drop open request from %pI4/%u\n",
				    &ireq->ir_rmt_addr, port);
#if IS_ENABLED(CONFIG_IPV6)
	else if (family == AF_INET6)
		net_dbg_ratelimited("drop open request from %pI6/%u\n",
				    &ireq->ir_v6_rmt_addr, port);
#endif
}

/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
 *
 * If we receive a SYN packet with these bits set, it means a
 * network is playing bad games with TOS bits. In order to
 * avoid possible false congestion notifications, we disable
 * TCP ECN negotiation.
 *
 * Exception: tcp_ca wants ECN. This is required for DCTCP
 * congestion control: Linux DCTCP asserts ECT on all packets,
 * including SYN. We also honor a route that explicitly allows
 * ECN (DST_FEATURE_ECN_MASK).
 */
static void tcp_ecn_create_request(struct request_sock *req,
				   const struct sk_buff *skb,
				   const struct sock *listen_sk,
				   const struct dst_entry *dst)
{
	const struct tcphdr *th = tcp_hdr(skb);
	const struct net *net = sock_net(listen_sk);
	bool th_ecn = th->ece && th->cwr;
	bool ect, ecn_ok;
	u32 ecn_ok_dst;

	if (!th_ecn)
		return;

	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;

	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
	    (ecn_ok_dst & DST_FEATURE_ECN_CA))
		inet_rsk(req)->ecn_ok = 1;
}

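/* Fill a freshly allocated request sock from the options parsed out of
 * the incoming SYN: initial sequence numbers, MSS, the peer's
 * timestamp/SACK/window-scale capabilities and the remote port.
 */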
static void tcp_openreq_init(struct request_sock *req,
			     const struct tcp_options_received *rx_opt,
			     struct sk_buff *skb, const struct sock *sk)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
	req->cookie_ts = 0;
	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
	skb_mstamp_get(&tcp_rsk(req)->snt_synack);
	tcp_rsk(req)->last_oow_ack_time = 0;
	req->mss = rx_opt->mss_clamp;
	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
	ireq->tstamp_ok = rx_opt->tstamp_ok;
	ireq->sack_ok = rx_opt->sack_ok;
	ireq->snd_wscale = rx_opt->snd_wscale;
	ireq->wscale_ok = rx_opt->wscale_ok;
	ireq->acked = 0;
	ireq->ecn_ok = 0;
	ireq->ir_rmt_port = tcp_hdr(skb)->source;
	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
	ireq->ir_mark = inet_request_mark(sk, skb);
}

struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
				      struct sock *sk_listener,
				      bool attach_listener)
{
	struct request_sock *req = reqsk_alloc(ops, sk_listener,
					       attach_listener);

	if (req) {
		struct inet_request_sock *ireq = inet_rsk(req);

		kmemcheck_annotate_bitfield(ireq, flags);
		ireq->opt = NULL;
		atomic64_set(&ireq->ir_cookie, 0);
		ireq->ireq_state = TCP_NEW_SYN_RECV;
		write_pnet(&ireq->ireq_net, sock_net(sk_listener));
		ireq->ireq_family = sk_listener->sk_family;
	}

	return req;
}
EXPORT_SYMBOL(inet_reqsk_alloc);

/*
 * Return true if a syncookie should be sent
 */
static bool tcp_syn_flood_action(const struct sock *sk,
				 const struct sk_buff *skb,
				 const char *proto)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct net *net = sock_net(sk);

#ifdef CONFIG_SYN_COOKIES
	if (net->ipv4.sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	if (!queue->synflood_warned &&
	    net->ipv4.sysctl_tcp_syncookies != 2 &&
	    xchg(&queue->synflood_warned, 1) == 0)
		pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);

	return want_cookie;
}

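/* If the listener has TCP_SAVE_SYN enabled, stash a copy of the SYN's
 * network and TCP headers on the request sock so that user space can
 * later fetch them via the TCP_SAVED_SYN socket option. copy[0] holds
 * the length of the saved headers.
 */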
static void tcp_reqsk_record_syn(const struct sock *sk,
				 struct request_sock *req,
				 const struct sk_buff *skb)
{
	if (tcp_sk(sk)->save_syn) {
		u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
		u32 *copy;

		copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
		if (copy) {
			copy[0] = len;
			memcpy(&copy[1], skb_network_header(skb), len);
			req->saved_syn = copy;
		}
	}
}

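/* Common SYN processing for IPv4 and IPv6 listeners: applies syncookie
 * and accept-queue pressure policy, allocates and initializes a request
 * sock, and answers with a SYN-ACK (possibly via a Fast Open child
 * socket). Address-family specifics are supplied through @af_ops.
 */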
int tcp_conn_request(struct request_sock_ops *rsk_ops,
		     const struct tcp_request_sock_ops *af_ops,
		     struct sock *sk, struct sk_buff *skb)
{
	struct tcp_fastopen_cookie foc = { .len = -1 };
	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
	struct tcp_options_received tmp_opt;
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct sock *fastopen_sk = NULL;
	struct dst_entry *dst = NULL;
	struct request_sock *req;
	bool want_cookie = false;
	struct flowi fl;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
		if (!want_cookie)
			goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * of warm entries in syn queue, drop request. It is better than
	 * clogging syn queue with openreqs with exponentially increasing
	 * timeout.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
	if (!req)
		goto drop;

	tcp_rsk(req)->af_specific = af_ops;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = af_ops->mss_clamp;
	tmp_opt.user_mss = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb, sk);

	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

	af_ops->init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie && !isn) {
		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tcp_death_row.sysctl_tw_recycle) {
			bool strict;

			dst = af_ops->route_req(sk, &fl, req, &strict);

			if (dst && strict &&
			    !tcp_peer_is_proven(req, dst, true,
						tmp_opt.saw_tstamp)) {
				NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!net->ipv4.sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 !tcp_peer_is_proven(req, dst, false,
					     tmp_opt.saw_tstamp)) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				    rsk_ops->family);
			goto drop_and_release;
		}

		isn = af_ops->init_seq(skb);
	}
	if (!dst) {
		dst = af_ops->route_req(sk, &fl, req, NULL);
		if (!dst)
			goto drop_and_free;
	}

	tcp_ecn_create_request(req, skb, sk, dst);

	if (want_cookie) {
		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
		if (!tmp_opt.tstamp_ok)
			inet_rsk(req)->ecn_ok = 0;
	}

	tcp_rsk(req)->snt_isn = isn;
	tcp_rsk(req)->txhash = net_tx_rndhash();
	tcp_openreq_init_rwin(req, sk, dst);
	if (!want_cookie) {
		tcp_reqsk_record_syn(sk, req, skb);
		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
	}
	if (fastopen_sk) {
		af_ops->send_synack(fastopen_sk, dst, &fl, req,
				    &foc, TCP_SYNACK_FASTOPEN);
		/* Add the child socket directly into the accept queue */
		inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
		sk->sk_data_ready(sk);
		bh_unlock_sock(fastopen_sk);
		sock_put(fastopen_sk);
	} else {
		tcp_rsk(req)->tfo_listener = false;
		if (!want_cookie)
			inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
		af_ops->send_synack(sk, dst, &fl, req, &foc,
				    !want_cookie ? TCP_SYNACK_NORMAL :
						   TCP_SYNACK_COOKIE);
		if (want_cookie) {
			reqsk_free(req);
			return 0;
		}
	}
	reqsk_put(req);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_conn_request);
