#define pr_fmt(fmt) "TCP: " fmt

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>

int sysctl_tcp_fack __read_mostly;
int sysctl_tcp_max_reordering __read_mostly = 300;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);

int sysctl_tcp_challenge_ack_limit = 1000;

int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;

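/* Bits accumulated in the "flag" word while an incoming ACK is processed;
 * they describe what the ACK carried (data, window update, SACK/D-SACK,
 * ECE, ...) and which actions the ACK-processing code still has to take.
 */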
#define FLAG_DATA		0x01	/* Incoming frame contained data.	*/
#define FLAG_WIN_UPDATE		0x02	/* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04	/* This ACK acknowledged new data.	*/
#define FLAG_RETRANS_DATA_ACKED	0x08	/* ... some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10	/* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20	/* New SACK.				*/
#define FLAG_ECE		0x40	/* ECE in this ACK.			*/
#define FLAG_LOST_RETRANS	0x80	/* This ACK marks some retransmission lost. */
#define FLAG_SLOWPATH		0x100	/* Do not skip RFC checks for window update. */
#define FLAG_ORIG_SACK_ACKED	0x200	/* Never-retransmitted data are (s)acked. */
#define FLAG_SND_UNA_ADVANCED	0x400	/* snd_una was changed (!= FLAG_DATA_ACKED). */
#define FLAG_DSACKING_ACK	0x800	/* SACK blocks contained D-SACK info.	*/
#define FLAG_SET_XMIT_TIMER	0x1000	/* Set TLP or RTO timer.		*/
#define FLAG_SACK_RENEGING	0x2000	/* snd_una advanced to a sacked seq.	*/
#define FLAG_UPDATE_TS_RECENT	0x4000	/* tcp_replace_ts_recent().		*/
#define FLAG_NO_CHALLENGE_ACK	0x8000	/* Do not call tcp_send_challenge_ack(). */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

#define REXMIT_NONE	0 /* no loss recovery to do */
#define REXMIT_LOST	1 /* retransmit packets marked lost */
#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */

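/* Called when an incoming skb claims a segment size larger than our
 * advertised MSS plus option space; warn once per boot if that size also
 * reaches the device MTU, which suggests a suspect GRO implementation.
 */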
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
			     unsigned int len)
{
	static bool __once __read_mostly;

	if (!__once) {
		struct net_device *dev;

		__once = true;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
		if (!dev || len >= dev->mtu)
			pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
				dev ? dev->name : "Unknown driver");
		rcu_read_unlock();
	}
}

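/* Adapt the ACK-side MSS estimate (icsk_ack.rcv_mss) to the segment sizes
 * the peer actually sends: grow it directly from large (GSO) segments, and
 * for small segments try to detect a stable header+payload size so the
 * delayed-ACK machinery keeps working against peers sending odd-sized frames.
 */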
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
	unsigned int len;

	icsk->icsk_ack.last_seg_size = 0;

	len = skb_shinfo(skb)->gso_size ? : skb->len;
	if (len >= icsk->icsk_ack.rcv_mss) {
		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
					       tcp_sk(sk)->advmss);

		if (unlikely(len > icsk->icsk_ack.rcv_mss +
				   MAX_TCP_OPTION_SPACE))
			tcp_gro_dev_warn(sk, skb, len);
	} else {
		len += skb->data - skb_transport_header(skb);
		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
			if (len == lss) {
				icsk->icsk_ack.rcv_mss = len;
				return;
			}
		}
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
	}
}

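/* Quick-ACK mode: ACK every segment for a while instead of delaying ACKs.
 * The quota is derived from the receive window in units of 2*rcv_mss and
 * capped at TCP_MAX_QUICKACKS; it is consumed as ACKs are sent.
 */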
static void tcp_incr_quickack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

	if (quickacks == 0)
		quickacks = 2;
	if (quickacks > icsk->icsk_ack.quick)
		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}

static void tcp_enter_quickack_mode(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_incr_quickack(sk);
	icsk->icsk_ack.pingpong = 0;
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}

static bool tcp_in_quickack_mode(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);

	return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
		(icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
}

static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
	if (tp->ecn_flags & TCP_ECN_OK)
		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
	if (tcp_hdr(skb)->cwr)
		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
	case INET_ECN_NOT_ECT:
		if (tp->ecn_flags & TCP_ECN_SEEN)
			tcp_enter_quickack_mode((struct sock *)tp);
		break;
	case INET_ECN_CE:
		if (tcp_ca_needs_ecn((struct sock *)tp))
			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);

		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
			tcp_enter_quickack_mode((struct sock *)tp);
			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		}
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	default:
		if (tcp_ca_needs_ecn((struct sock *)tp))
			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	}
}

static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
	if (tp->ecn_flags & TCP_ECN_OK)
		__tcp_ecn_check_ce(tp, skb);
}

static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
		return true;
	return false;
}

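/* Size the send buffer so roughly two windows' worth of skbs fit (at least
 * the initial cwnd, and at least reordering + 1 segments), including skb
 * and shared-info overhead.  The factor defaults to 2 unless the congestion
 * control provides its own via ->sndbuf_expand().  Never shrinks, and is
 * capped by tcp_wmem[2].
 */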
static void tcp_sndbuf_expand(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	int sndmem, per_mss;
	u32 nr_segs;

	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
		  MAX_TCP_HEADER +
		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	per_mss = roundup_pow_of_two(per_mss) +
		  SKB_DATA_ALIGN(sizeof(struct sk_buff));

	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);

	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
	sndmem *= nr_segs * per_mss;

	if (sk->sk_sndbuf < sndmem)
		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
}

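/* Receive-window growth heuristics: on in-order receive, slowly raise
 * rcv_ssthresh (the limit on advertised-window growth) as long as the skbs
 * we get carry enough payload relative to their truesize and we are not
 * under memory pressure, so the window can open up to window_clamp.
 */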
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int truesize = tcp_win_from_space(skb->truesize) >> 1;
	int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}

static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->rcv_ssthresh < tp->window_clamp &&
	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
	    !tcp_under_memory_pressure(sk)) {
		int incr;

		if (tcp_win_from_space(skb->truesize) <= skb->len)
			incr = 2 * tp->advmss;
		else
			incr = __tcp_grow_window(sk, skb);

		if (incr) {
			incr = max_t(int, incr, 2 * skb->len);
			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
					       tp->window_clamp);
			inet_csk(sk)->icsk_ack.quick |= 1;
		}
	}
}

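/* Initial receive/send buffer sizing, typically done once the connection
 * reaches established state: pick an rcvbuf large enough for the initial
 * receive window, seed the receive-autotuning state, and derive
 * window_clamp and rcv_ssthresh from the resulting space.
 */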
static void tcp_fixup_rcvbuf(struct sock *sk)
{
	u32 mss = tcp_sk(sk)->advmss;
	int rcvmem;

	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
		 tcp_default_init_rwnd(mss);

	if (sysctl_tcp_moderate_rcvbuf)
		rcvmem <<= 2;

	if (sk->sk_rcvbuf < rcvmem)
		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
}

void tcp_init_buffer_space(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		tcp_fixup_rcvbuf(sk);
	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
		tcp_sndbuf_expand(sk);

	tp->rcvq_space.space = tp->rcv_wnd;
	tcp_mstamp_refresh(tp);
	tp->rcvq_space.time = tp->tcp_mstamp;
	tp->rcvq_space.seq = tp->copied_seq;

	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
			tp->window_clamp = max(maxwin -
					       (maxwin >> sysctl_tcp_app_win),
					       4 * tp->advmss);
	}

	if (sysctl_tcp_app_win &&
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
	tp->snd_cwnd_stamp = tcp_jiffies32;
}

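/* Called when the receive queue has outgrown sk_rcvbuf: disable quick ACKs,
 * bump sk_rcvbuf up to what is actually allocated if memory limits allow,
 * and otherwise shrink rcv_ssthresh so the advertised window backs off.
 */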
static void tcp_clamp_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_ack.quick = 0;

	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
	    !tcp_under_memory_pressure(sk) &&
	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				    sysctl_tcp_rmem[2]);
	}
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

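/* Seed icsk_ack.rcv_mss with a conservative guess (bounded by advmss,
 * mss_cache, half the receive window and TCP_MSS_DEFAULT) until real
 * segments let tcp_measure_rcv_mss() refine it.
 */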
void tcp_initialize_rcv_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd / 2);
	hint = min(hint, TCP_MSS_DEFAULT);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
EXPORT_SYMBOL(tcp_initialize_rcv_mss);

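/* Receiver-side RTT estimation, used by receive-buffer autotuning when the
 * peer does not echo timestamps.  tcp_rcv_rtt_update() keeps a smoothed
 * value (stored left-shifted by 3): a classic 7/8-1/8 EWMA for timestamp
 * samples, while the window-based measurement only lets the estimate
 * decrease toward smaller samples.
 */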
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt_us;
	long m = sample;

	if (m == 0)
		m = 1;

	if (new_sample != 0) {
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else {
			m <<= 3;
			if (m < new_sample)
				new_sample = m;
		}
	} else {
		new_sample = m << 3;
	}

	tp->rcv_rtt_est.rtt_us = new_sample;
}

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	u32 delta_us;

	if (tp->rcv_rtt_est.time == 0)
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
	tcp_rcv_rtt_update(tp, delta_us, 1);

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tp->tcp_mstamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
					  const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->rx_opt.rcv_tsecr &&
	    (TCP_SKB_CB(skb)->end_seq -
	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
		u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);

		tcp_rcv_rtt_update(tp, delta_us, 0);
	}
}

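/* Dynamic right-sizing of the receive buffer: once per RTT, compare how
 * much the application copied out with the previous estimate and, if
 * autotuning is allowed, grow sk_rcvbuf/window_clamp so the advertised
 * window does not become the bottleneck.
 */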
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int time;
	int copied;

	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
	if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
		return;

	copied = tp->copied_seq - tp->rcvq_space.seq;
	if (copied <= tp->rcvq_space.space)
		goto new_measure;

	if (sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvwin, rcvmem, rcvbuf;

		rcvwin = (copied << 1) + 16 * tp->advmss;

		if (copied >=
		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
			if (copied >=
			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
				rcvwin <<= 1;
			else
				rcvwin += (rcvwin >> 1);
		}

		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(rcvmem) < tp->advmss)
			rcvmem += 128;

		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
		if (rcvbuf > sk->sk_rcvbuf) {
			sk->sk_rcvbuf = rcvbuf;

			tp->window_clamp = rcvwin;
		}
	}
	tp->rcvq_space.space = copied;

new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tp->tcp_mstamp;
}

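/* Per-data-segment receive bookkeeping: schedule an ACK, refresh the
 * rcv_mss and receiver RTT estimates, adapt the delayed-ACK timeout (ato)
 * to the observed inter-arrival gap, note ECN CE marks, and grow the
 * receive window for sufficiently large segments.
 */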
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now;

	inet_csk_schedule_ack(sk);

	tcp_measure_rcv_mss(sk, skb);

	tcp_rcv_rtt_measure(tp);

	now = tcp_jiffies32;

	if (!icsk->icsk_ack.ato) {
		tcp_incr_quickack(sk);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
	} else {
		int m = now - icsk->icsk_ack.lrcvtime;

		if (m <= TCP_ATO_MIN / 2) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
			tcp_incr_quickack(sk);
			sk_mem_reclaim(sk);
		}
	}
	icsk->icsk_ack.lrcvtime = now;

	tcp_ecn_check_ce(tp, skb);

	if (skb->len >= 128)
		tcp_grow_window(sk, skb);
}

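/* RTT estimator in the Jacobson/Karels style of RFC 6298.  srtt_us holds
 * 8*SRTT and mdev_us 4*mdev, so the updates below implement
 * SRTT <- 7/8*SRTT + 1/8*m and mdev <- 3/4*mdev + 1/4*|err|, with extra
 * damping of mdev when the RTT suddenly drops.
 */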
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
	struct tcp_sock *tp = tcp_sk(sk);
	long m = mrtt_us;
	u32 srtt = tp->srtt_us;

	if (srtt != 0) {
		m -= (srtt >> 3);
		srtt += m;
		if (m < 0) {
			m = -m;
			m -= (tp->mdev_us >> 2);
			if (m > 0)
				m >>= 3;
		} else {
			m -= (tp->mdev_us >> 2);
		}
		tp->mdev_us += m;
		if (tp->mdev_us > tp->mdev_max_us) {
			tp->mdev_max_us = tp->mdev_us;
			if (tp->mdev_max_us > tp->rttvar_us)
				tp->rttvar_us = tp->mdev_max_us;
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
			if (tp->mdev_max_us < tp->rttvar_us)
				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max_us = tcp_rto_min_us(sk);
		}
	} else {
		srtt = m << 3;
		tp->mdev_us = m << 1;
		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
		tp->mdev_max_us = tp->rttvar_us;
		tp->rtt_seq = tp->snd_nxt;
	}
	tp->srtt_us = max(1U, srtt);
}

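/* Pacing rate: roughly cwnd * mss / srtt, scaled by ~200%
 * (sysctl_tcp_pacing_ss_ratio) while cwnd is below half of ssthresh and
 * ~120% (sysctl_tcp_pacing_ca_ratio) otherwise, capped by
 * sk_max_pacing_rate.
 */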
int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;

static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u64 rate;

	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
		rate *= sysctl_tcp_pacing_ss_ratio;
	else
		rate *= sysctl_tcp_pacing_ca_ratio;

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (likely(tp->srtt_us))
		do_div(rate, tp->srtt_us);

	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
						sk->sk_max_pacing_rate);
}

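/* Recompute the retransmission timeout from the current srtt/rttvar via
 * __tcp_set_rto() and bound it with tcp_bound_rto().
 */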
static void tcp_set_rto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);

	tcp_bound_rto(sk);
}

__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

	if (!cwnd)
		cwnd = TCP_INIT_CWND;
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}

void tcp_disable_fack(struct tcp_sock *tp)
{
	if (tcp_is_fack(tp))
		tp->lost_skb_hint = NULL;
	tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
}

static void tcp_dsack_seen(struct tcp_sock *tp)
{
	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
}

static void tcp_update_reordering(struct sock *sk, const int metric,
				  const int ts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mib_idx;

	if (WARN_ON_ONCE(metric < 0))
		return;

	if (metric > tp->reordering) {
		tp->reordering = min(sysctl_tcp_max_reordering, metric);

#if FASTRETRANS_DEBUG > 1
		pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
			 tp->reordering,
			 tp->fackets_out,
			 tp->sacked_out,
			 tp->undo_marker ? tp->undo_retrans : 0);
#endif
		tcp_disable_fack(tp);
	}

	tp->rack.reord = 1;

	if (ts)
		mib_idx = LINUX_MIB_TCPTSREORDER;
	else if (tcp_is_reno(tp))
		mib_idx = LINUX_MIB_TCPRENOREORDER;
	else if (tcp_is_fack(tp))
		mib_idx = LINUX_MIB_TCPFACKREORDER;
	else
		mib_idx = LINUX_MIB_TCPSACKREORDER;

	NET_INC_STATS(sock_net(sk), mib_idx);
}

static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (!tp->retransmit_skb_hint ||
	    before(TCP_SKB_CB(skb)->seq,
		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
		tp->retransmit_skb_hint = skb;
}

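/* Loss marking helpers: tcp_sum_lost() accounts newly lost segments in
 * tp->lost, tcp_skb_mark_lost() additionally tags the skb TCPCB_LOST and
 * updates lost_out, and the _uncond_verify variant refreshes the
 * retransmit hint and accounting even for already-marked skbs.
 */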
static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	__u8 sacked = TCP_SKB_CB(skb)->sacked;

	if (!(sacked & TCPCB_LOST) ||
	    ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
		tp->lost += tcp_skb_pcount(skb);
}

static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tcp_verify_retransmit_hint(tp, skb);

		tp->lost_out += tcp_skb_pcount(skb);
		tcp_sum_lost(tp, skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
	tcp_verify_retransmit_hint(tp, skb);

	tcp_sum_lost(tp, skb);
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

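/* Validate an incoming (D)SACK block: reject blocks that are reversed or
 * beyond snd_nxt, and for D-SACKs also blocks incompatible with the undo
 * window tracked in undo_marker, so bogus blocks cannot corrupt the
 * scoreboard or the undo machinery.
 */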
static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				   u32 start_seq, u32 end_seq)
{
	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
		return false;

	if (!before(start_seq, tp->snd_nxt))
		return false;

	if (after(start_seq, tp->snd_una))
		return true;

	if (!is_dsack || !tp->undo_marker)
		return false;

	if (after(end_seq, tp->snd_una))
		return false;

	if (!before(start_seq, tp->undo_marker))
		return true;

	if (!after(end_seq, tp->undo_marker))
		return false;

	return !before(start_seq, end_seq - tp->max_window);
}

1100static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1101 struct tcp_sack_block_wire *sp, int num_sacks,
1102 u32 prior_snd_una)
1103{
1104 struct tcp_sock *tp = tcp_sk(sk);
1105 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
1106 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1107 bool dup_sack = false;
1108
1109 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1110 dup_sack = true;
1111 tcp_dsack_seen(tp);
1112 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1113 } else if (num_sacks > 1) {
1114 u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
1115 u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
1116
1117 if (!after(end_seq_0, end_seq_1) &&
1118 !before(start_seq_0, start_seq_1)) {
1119 dup_sack = true;
1120 tcp_dsack_seen(tp);
1121 NET_INC_STATS(sock_net(sk),
1122 LINUX_MIB_TCPDSACKOFORECV);
1123 }
1124 }
1125
1126
1127 if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
1128 !after(end_seq_0, prior_snd_una) &&
1129 after(end_seq_0, tp->undo_marker))
1130 tp->undo_retrans--;
1131
1132 return dup_sack;
1133}
1134
1135struct tcp_sacktag_state {
1136 int reord;
1137 int fack_count;
1138
1139
1140
1141
1142 u64 first_sackt;
1143 u64 last_sackt;
1144 struct rate_sample *rate;
1145 int flag;
1146};
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1157 u32 start_seq, u32 end_seq)
1158{
1159 int err;
1160 bool in_sack;
1161 unsigned int pkt_len;
1162 unsigned int mss;
1163
1164 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1165 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1166
1167 if (tcp_skb_pcount(skb) > 1 && !in_sack &&
1168 after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
1169 mss = tcp_skb_mss(skb);
1170 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1171
1172 if (!in_sack) {
1173 pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
1174 if (pkt_len < mss)
1175 pkt_len = mss;
1176 } else {
1177 pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
1178 if (pkt_len < mss)
1179 return -EINVAL;
1180 }
1181
1182
1183
1184
1185 if (pkt_len > mss) {
1186 unsigned int new_len = (pkt_len / mss) * mss;
1187 if (!in_sack && new_len < pkt_len)
1188 new_len += mss;
1189 pkt_len = new_len;
1190 }
1191
1192 if (pkt_len >= skb->len && !in_sack)
1193 return 0;
1194
1195 err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
1196 if (err < 0)
1197 return err;
1198 }
1199
1200 return in_sack;
1201}
1202
1203
1204static u8 tcp_sacktag_one(struct sock *sk,
1205 struct tcp_sacktag_state *state, u8 sacked,
1206 u32 start_seq, u32 end_seq,
1207 int dup_sack, int pcount,
1208 u64 xmit_time)
1209{
1210 struct tcp_sock *tp = tcp_sk(sk);
1211 int fack_count = state->fack_count;
1212
1213
1214 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1215 if (tp->undo_marker && tp->undo_retrans > 0 &&
1216 after(end_seq, tp->undo_marker))
1217 tp->undo_retrans--;
1218 if (sacked & TCPCB_SACKED_ACKED)
1219 state->reord = min(fack_count, state->reord);
1220 }
1221
1222
1223 if (!after(end_seq, tp->snd_una))
1224 return sacked;
1225
1226 if (!(sacked & TCPCB_SACKED_ACKED)) {
1227 tcp_rack_advance(tp, sacked, end_seq, xmit_time);
1228
1229 if (sacked & TCPCB_SACKED_RETRANS) {
1230
1231
1232
1233
1234 if (sacked & TCPCB_LOST) {
1235 sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1236 tp->lost_out -= pcount;
1237 tp->retrans_out -= pcount;
1238 }
1239 } else {
1240 if (!(sacked & TCPCB_RETRANS)) {
1241
1242
1243
1244 if (before(start_seq,
1245 tcp_highest_sack_seq(tp)))
1246 state->reord = min(fack_count,
1247 state->reord);
1248 if (!after(end_seq, tp->high_seq))
1249 state->flag |= FLAG_ORIG_SACK_ACKED;
1250 if (state->first_sackt == 0)
1251 state->first_sackt = xmit_time;
1252 state->last_sackt = xmit_time;
1253 }
1254
1255 if (sacked & TCPCB_LOST) {
1256 sacked &= ~TCPCB_LOST;
1257 tp->lost_out -= pcount;
1258 }
1259 }
1260
1261 sacked |= TCPCB_SACKED_ACKED;
1262 state->flag |= FLAG_DATA_SACKED;
1263 tp->sacked_out += pcount;
1264 tp->delivered += pcount;
1265
1266 fack_count += pcount;
1267
1268
1269 if (!tcp_is_fack(tp) && tp->lost_skb_hint &&
1270 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
1271 tp->lost_cnt_hint += pcount;
1272
1273 if (fack_count > tp->fackets_out)
1274 tp->fackets_out = fack_count;
1275 }
1276
1277
1278
1279
1280
1281 if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
1282 sacked &= ~TCPCB_SACKED_RETRANS;
1283 tp->retrans_out -= pcount;
1284 }
1285
1286 return sacked;
1287}
1288
1289
1290
1291
1292static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1293 struct tcp_sacktag_state *state,
1294 unsigned int pcount, int shifted, int mss,
1295 bool dup_sack)
1296{
1297 struct tcp_sock *tp = tcp_sk(sk);
1298 struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
1299 u32 start_seq = TCP_SKB_CB(skb)->seq;
1300 u32 end_seq = start_seq + shifted;
1301
1302 BUG_ON(!pcount);
1303
1304
1305
1306
1307
1308
1309
1310 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1311 start_seq, end_seq, dup_sack, pcount,
1312 skb->skb_mstamp);
1313 tcp_rate_skb_delivered(sk, skb, state->rate);
1314
1315 if (skb == tp->lost_skb_hint)
1316 tp->lost_cnt_hint += pcount;
1317
1318 TCP_SKB_CB(prev)->end_seq += shifted;
1319 TCP_SKB_CB(skb)->seq += shifted;
1320
1321 tcp_skb_pcount_add(prev, pcount);
1322 BUG_ON(tcp_skb_pcount(skb) < pcount);
1323 tcp_skb_pcount_add(skb, -pcount);
1324
1325
1326
1327
1328
1329
1330 if (!TCP_SKB_CB(prev)->tcp_gso_size)
1331 TCP_SKB_CB(prev)->tcp_gso_size = mss;
1332
1333
1334 if (tcp_skb_pcount(skb) <= 1)
1335 TCP_SKB_CB(skb)->tcp_gso_size = 0;
1336
1337
1338 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1339
1340 if (skb->len > 0) {
1341 BUG_ON(!tcp_skb_pcount(skb));
1342 NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1343 return false;
1344 }
1345
1346
1347
1348 if (skb == tp->retransmit_skb_hint)
1349 tp->retransmit_skb_hint = prev;
1350 if (skb == tp->lost_skb_hint) {
1351 tp->lost_skb_hint = prev;
1352 tp->lost_cnt_hint -= tcp_skb_pcount(prev);
1353 }
1354
1355 TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1356 TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
1357 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1358 TCP_SKB_CB(prev)->end_seq++;
1359
1360 if (skb == tcp_highest_sack(sk))
1361 tcp_advance_highest_sack(sk, skb);
1362
1363 tcp_skb_collapse_tstamp(prev, skb);
1364 if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
1365 TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
1366
1367 tcp_unlink_write_queue(skb, sk);
1368 sk_wmem_free_skb(sk, skb);
1369
1370 NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
1371
1372 return true;
1373}
1374
1375
1376
1377
1378static int tcp_skb_seglen(const struct sk_buff *skb)
1379{
1380 return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
1381}
1382
1383
1384static int skb_can_shift(const struct sk_buff *skb)
1385{
1386 return !skb_headlen(skb) && skb_is_nonlinear(skb);
1387}
1388
1389
1390
1391
1392static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1393 struct tcp_sacktag_state *state,
1394 u32 start_seq, u32 end_seq,
1395 bool dup_sack)
1396{
1397 struct tcp_sock *tp = tcp_sk(sk);
1398 struct sk_buff *prev;
1399 int mss;
1400 int pcount = 0;
1401 int len;
1402 int in_sack;
1403
1404 if (!sk_can_gso(sk))
1405 goto fallback;
1406
1407
1408 if (!dup_sack &&
1409 (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
1410 goto fallback;
1411 if (!skb_can_shift(skb))
1412 goto fallback;
1413
1414 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1415 goto fallback;
1416
1417
1418 if (unlikely(skb == tcp_write_queue_head(sk)))
1419 goto fallback;
1420 prev = tcp_write_queue_prev(sk, skb);
1421
1422 if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1423 goto fallback;
1424
1425 if (!tcp_skb_can_collapse_to(prev))
1426 goto fallback;
1427
1428 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1429 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
1430
1431 if (in_sack) {
1432 len = skb->len;
1433 pcount = tcp_skb_pcount(skb);
1434 mss = tcp_skb_seglen(skb);
1435
1436
1437
1438
1439 if (mss != tcp_skb_seglen(prev))
1440 goto fallback;
1441 } else {
1442 if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1443 goto noop;
1444
1445
1446
1447
1448 if (tcp_skb_pcount(skb) <= 1)
1449 goto noop;
1450
1451 in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1452 if (!in_sack) {
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464 goto fallback;
1465 }
1466
1467 len = end_seq - TCP_SKB_CB(skb)->seq;
1468 BUG_ON(len < 0);
1469 BUG_ON(len > skb->len);
1470
1471
1472
1473
1474
1475 mss = tcp_skb_mss(skb);
1476
1477
1478
1479
1480 if (mss != tcp_skb_seglen(prev))
1481 goto fallback;
1482
1483 if (len == mss) {
1484 pcount = 1;
1485 } else if (len < mss) {
1486 goto noop;
1487 } else {
1488 pcount = len / mss;
1489 len = pcount * mss;
1490 }
1491 }
1492
1493
1494 if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
1495 goto fallback;
1496
1497 if (!skb_shift(prev, skb, len))
1498 goto fallback;
1499 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
1500 goto out;
1501
1502
1503
1504
1505 if (prev == tcp_write_queue_tail(sk))
1506 goto out;
1507 skb = tcp_write_queue_next(sk, prev);
1508
1509 if (!skb_can_shift(skb) ||
1510 (skb == tcp_send_head(sk)) ||
1511 ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
1512 (mss != tcp_skb_seglen(skb)))
1513 goto out;
1514
1515 len = skb->len;
1516 if (skb_shift(prev, skb, len)) {
1517 pcount += tcp_skb_pcount(skb);
1518 tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
1519 }
1520
1521out:
1522 state->fack_count += pcount;
1523 return prev;
1524
1525noop:
1526 return skb;
1527
1528fallback:
1529 NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
1530 return NULL;
1531}
1532
1533static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1534 struct tcp_sack_block *next_dup,
1535 struct tcp_sacktag_state *state,
1536 u32 start_seq, u32 end_seq,
1537 bool dup_sack_in)
1538{
1539 struct tcp_sock *tp = tcp_sk(sk);
1540 struct sk_buff *tmp;
1541
1542 tcp_for_write_queue_from(skb, sk) {
1543 int in_sack = 0;
1544 bool dup_sack = dup_sack_in;
1545
1546 if (skb == tcp_send_head(sk))
1547 break;
1548
1549
1550 if (!before(TCP_SKB_CB(skb)->seq, end_seq))
1551 break;
1552
1553 if (next_dup &&
1554 before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
1555 in_sack = tcp_match_skb_to_sack(sk, skb,
1556 next_dup->start_seq,
1557 next_dup->end_seq);
1558 if (in_sack > 0)
1559 dup_sack = true;
1560 }
1561
1562
1563
1564
1565
1566 if (in_sack <= 0) {
1567 tmp = tcp_shift_skb_data(sk, skb, state,
1568 start_seq, end_seq, dup_sack);
1569 if (tmp) {
1570 if (tmp != skb) {
1571 skb = tmp;
1572 continue;
1573 }
1574
1575 in_sack = 0;
1576 } else {
1577 in_sack = tcp_match_skb_to_sack(sk, skb,
1578 start_seq,
1579 end_seq);
1580 }
1581 }
1582
1583 if (unlikely(in_sack < 0))
1584 break;
1585
1586 if (in_sack) {
1587 TCP_SKB_CB(skb)->sacked =
1588 tcp_sacktag_one(sk,
1589 state,
1590 TCP_SKB_CB(skb)->sacked,
1591 TCP_SKB_CB(skb)->seq,
1592 TCP_SKB_CB(skb)->end_seq,
1593 dup_sack,
1594 tcp_skb_pcount(skb),
1595 skb->skb_mstamp);
1596 tcp_rate_skb_delivered(sk, skb, state->rate);
1597
1598 if (!before(TCP_SKB_CB(skb)->seq,
1599 tcp_highest_sack_seq(tp)))
1600 tcp_advance_highest_sack(sk, skb);
1601 }
1602
1603 state->fack_count += tcp_skb_pcount(skb);
1604 }
1605 return skb;
1606}
1607
1608
1609
1610
1611static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
1612 struct tcp_sacktag_state *state,
1613 u32 skip_to_seq)
1614{
1615 tcp_for_write_queue_from(skb, sk) {
1616 if (skb == tcp_send_head(sk))
1617 break;
1618
1619 if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
1620 break;
1621
1622 state->fack_count += tcp_skb_pcount(skb);
1623 }
1624 return skb;
1625}
1626
1627static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
1628 struct sock *sk,
1629 struct tcp_sack_block *next_dup,
1630 struct tcp_sacktag_state *state,
1631 u32 skip_to_seq)
1632{
1633 if (!next_dup)
1634 return skb;
1635
1636 if (before(next_dup->start_seq, skip_to_seq)) {
1637 skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
1638 skb = tcp_sacktag_walk(skb, sk, NULL, state,
1639 next_dup->start_seq, next_dup->end_seq,
1640 1);
1641 }
1642
1643 return skb;
1644}
1645
1646static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
1647{
1648 return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1649}
1650
1651static int
1652tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1653 u32 prior_snd_una, struct tcp_sacktag_state *state)
1654{
1655 struct tcp_sock *tp = tcp_sk(sk);
1656 const unsigned char *ptr = (skb_transport_header(ack_skb) +
1657 TCP_SKB_CB(ack_skb)->sacked);
1658 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
1659 struct tcp_sack_block sp[TCP_NUM_SACKS];
1660 struct tcp_sack_block *cache;
1661 struct sk_buff *skb;
1662 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
1663 int used_sacks;
1664 bool found_dup_sack = false;
1665 int i, j;
1666 int first_sack_index;
1667
1668 state->flag = 0;
1669 state->reord = tp->packets_out;
1670
1671 if (!tp->sacked_out) {
1672 if (WARN_ON(tp->fackets_out))
1673 tp->fackets_out = 0;
1674 tcp_highest_sack_reset(sk);
1675 }
1676
1677 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
1678 num_sacks, prior_snd_una);
1679 if (found_dup_sack) {
1680 state->flag |= FLAG_DSACKING_ACK;
1681 tp->delivered++;
1682 }
1683
1684
1685
1686
1687
1688 if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
1689 return 0;
1690
1691 if (!tp->packets_out)
1692 goto out;
1693
1694 used_sacks = 0;
1695 first_sack_index = 0;
1696 for (i = 0; i < num_sacks; i++) {
1697 bool dup_sack = !i && found_dup_sack;
1698
1699 sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
1700 sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
1701
1702 if (!tcp_is_sackblock_valid(tp, dup_sack,
1703 sp[used_sacks].start_seq,
1704 sp[used_sacks].end_seq)) {
1705 int mib_idx;
1706
1707 if (dup_sack) {
1708 if (!tp->undo_marker)
1709 mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
1710 else
1711 mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
1712 } else {
1713
1714 if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
1715 !after(sp[used_sacks].end_seq, tp->snd_una))
1716 continue;
1717 mib_idx = LINUX_MIB_TCPSACKDISCARD;
1718 }
1719
1720 NET_INC_STATS(sock_net(sk), mib_idx);
1721 if (i == 0)
1722 first_sack_index = -1;
1723 continue;
1724 }
1725
1726
1727 if (!after(sp[used_sacks].end_seq, prior_snd_una))
1728 continue;
1729
1730 used_sacks++;
1731 }
1732
1733
1734 for (i = used_sacks - 1; i > 0; i--) {
1735 for (j = 0; j < i; j++) {
1736 if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
1737 swap(sp[j], sp[j + 1]);
1738
1739
1740 if (j == first_sack_index)
1741 first_sack_index = j + 1;
1742 }
1743 }
1744 }
1745
1746 skb = tcp_write_queue_head(sk);
1747 state->fack_count = 0;
1748 i = 0;
1749
1750 if (!tp->sacked_out) {
1751
1752 cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1753 } else {
1754 cache = tp->recv_sack_cache;
1755
1756 while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
1757 !cache->end_seq)
1758 cache++;
1759 }
1760
1761 while (i < used_sacks) {
1762 u32 start_seq = sp[i].start_seq;
1763 u32 end_seq = sp[i].end_seq;
1764 bool dup_sack = (found_dup_sack && (i == first_sack_index));
1765 struct tcp_sack_block *next_dup = NULL;
1766
1767 if (found_dup_sack && ((i + 1) == first_sack_index))
1768 next_dup = &sp[i + 1];
1769
1770
1771 while (tcp_sack_cache_ok(tp, cache) &&
1772 !before(start_seq, cache->end_seq))
1773 cache++;
1774
1775
1776 if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
1777 after(end_seq, cache->start_seq)) {
1778
1779
1780 if (before(start_seq, cache->start_seq)) {
1781 skb = tcp_sacktag_skip(skb, sk, state,
1782 start_seq);
1783 skb = tcp_sacktag_walk(skb, sk, next_dup,
1784 state,
1785 start_seq,
1786 cache->start_seq,
1787 dup_sack);
1788 }
1789
1790
1791 if (!after(end_seq, cache->end_seq))
1792 goto advance_sp;
1793
1794 skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
1795 state,
1796 cache->end_seq);
1797
1798
1799 if (tcp_highest_sack_seq(tp) == cache->end_seq) {
1800
1801 skb = tcp_highest_sack(sk);
1802 if (!skb)
1803 break;
1804 state->fack_count = tp->fackets_out;
1805 cache++;
1806 goto walk;
1807 }
1808
1809 skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
1810
1811 cache++;
1812 continue;
1813 }
1814
1815 if (!before(start_seq, tcp_highest_sack_seq(tp))) {
1816 skb = tcp_highest_sack(sk);
1817 if (!skb)
1818 break;
1819 state->fack_count = tp->fackets_out;
1820 }
1821 skb = tcp_sacktag_skip(skb, sk, state, start_seq);
1822
1823walk:
1824 skb = tcp_sacktag_walk(skb, sk, next_dup, state,
1825 start_seq, end_seq, dup_sack);
1826
1827advance_sp:
1828 i++;
1829 }
1830
1831
1832 for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
1833 tp->recv_sack_cache[i].start_seq = 0;
1834 tp->recv_sack_cache[i].end_seq = 0;
1835 }
1836 for (j = 0; j < used_sacks; j++)
1837 tp->recv_sack_cache[i++] = sp[j];
1838
1839 if ((state->reord < tp->fackets_out) &&
1840 ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
1841 tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
1842
1843 tcp_verify_left_out(tp);
1844out:
1845
1846#if FASTRETRANS_DEBUG > 0
1847 WARN_ON((int)tp->sacked_out < 0);
1848 WARN_ON((int)tp->lost_out < 0);
1849 WARN_ON((int)tp->retrans_out < 0);
1850 WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1851#endif
1852 return state->flag;
1853}
1854
1855
1856
1857
1858static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
1859{
1860 u32 holes;
1861
1862 holes = max(tp->lost_out, 1U);
1863 holes = min(holes, tp->packets_out);
1864
1865 if ((tp->sacked_out + holes) > tp->packets_out) {
1866 tp->sacked_out = tp->packets_out - holes;
1867 return true;
1868 }
1869 return false;
1870}
1871
1872
1873
1874
1875
1876static void tcp_check_reno_reordering(struct sock *sk, const int addend)
1877{
1878 struct tcp_sock *tp = tcp_sk(sk);
1879 if (tcp_limit_reno_sacked(tp))
1880 tcp_update_reordering(sk, tp->packets_out + addend, 0);
1881}
1882
1883
1884
1885static void tcp_add_reno_sack(struct sock *sk)
1886{
1887 struct tcp_sock *tp = tcp_sk(sk);
1888 u32 prior_sacked = tp->sacked_out;
1889
1890 tp->sacked_out++;
1891 tcp_check_reno_reordering(sk, 0);
1892 if (tp->sacked_out > prior_sacked)
1893 tp->delivered++;
1894 tcp_verify_left_out(tp);
1895}
1896
1897
1898
1899static void tcp_remove_reno_sacks(struct sock *sk, int acked)
1900{
1901 struct tcp_sock *tp = tcp_sk(sk);
1902
1903 if (acked > 0) {
1904
1905 tp->delivered += max_t(int, acked - tp->sacked_out, 1);
1906 if (acked - 1 >= tp->sacked_out)
1907 tp->sacked_out = 0;
1908 else
1909 tp->sacked_out -= acked - 1;
1910 }
1911 tcp_check_reno_reordering(sk, acked);
1912 tcp_verify_left_out(tp);
1913}
1914
1915static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1916{
1917 tp->sacked_out = 0;
1918}
1919
1920void tcp_clear_retrans(struct tcp_sock *tp)
1921{
1922 tp->retrans_out = 0;
1923 tp->lost_out = 0;
1924 tp->undo_marker = 0;
1925 tp->undo_retrans = -1;
1926 tp->fackets_out = 0;
1927 tp->sacked_out = 0;
1928}
1929
1930static inline void tcp_init_undo(struct tcp_sock *tp)
1931{
1932 tp->undo_marker = tp->snd_una;
1933
1934 tp->undo_retrans = tp->retrans_out ? : -1;
1935}
1936
1937
1938
1939
1940
1941void tcp_enter_loss(struct sock *sk)
1942{
1943 const struct inet_connection_sock *icsk = inet_csk(sk);
1944 struct tcp_sock *tp = tcp_sk(sk);
1945 struct net *net = sock_net(sk);
1946 struct sk_buff *skb;
1947 bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
1948 bool is_reneg;
1949 bool mark_lost;
1950
1951
1952 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
1953 !after(tp->high_seq, tp->snd_una) ||
1954 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1955 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1956 tp->prior_cwnd = tp->snd_cwnd;
1957 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1958 tcp_ca_event(sk, CA_EVENT_LOSS);
1959 tcp_init_undo(tp);
1960 }
1961 tp->snd_cwnd = 1;
1962 tp->snd_cwnd_cnt = 0;
1963 tp->snd_cwnd_stamp = tcp_jiffies32;
1964
1965 tp->retrans_out = 0;
1966 tp->lost_out = 0;
1967
1968 if (tcp_is_reno(tp))
1969 tcp_reset_reno_sack(tp);
1970
1971 skb = tcp_write_queue_head(sk);
1972 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
1973 if (is_reneg) {
1974 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1975 tp->sacked_out = 0;
1976 tp->fackets_out = 0;
1977 }
1978 tcp_clear_all_retrans_hints(tp);
1979
1980 tcp_for_write_queue(skb, sk) {
1981 if (skb == tcp_send_head(sk))
1982 break;
1983
1984 mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
1985 is_reneg);
1986 if (mark_lost)
1987 tcp_sum_lost(tp, skb);
1988 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
1989 if (mark_lost) {
1990 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1991 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1992 tp->lost_out += tcp_skb_pcount(skb);
1993 }
1994 }
1995 tcp_verify_left_out(tp);
1996
1997
1998
1999
2000 if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
2001 tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
2002 tp->reordering = min_t(unsigned int, tp->reordering,
2003 net->ipv4.sysctl_tcp_reordering);
2004 tcp_set_ca_state(sk, TCP_CA_Loss);
2005 tp->high_seq = tp->snd_nxt;
2006 tcp_ecn_queue_cwr(tp);
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017 tp->frto = sysctl_tcp_frto &&
2018 (new_recovery || icsk->icsk_retransmits) &&
2019 !inet_csk(sk)->icsk_mtup.probe_size;
2020}
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032static bool tcp_check_sack_reneging(struct sock *sk, int flag)
2033{
2034 if (flag & FLAG_SACK_RENEGING) {
2035 struct tcp_sock *tp = tcp_sk(sk);
2036 unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
2037 msecs_to_jiffies(10));
2038
2039 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2040 delay, TCP_RTO_MAX);
2041 return true;
2042 }
2043 return false;
2044}
2045
static inline int tcp_fackets_out(const struct tcp_sock *tp)
{
	return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
}

static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}

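/* Decide whether it is time to enter loss recovery (fast retransmit):
 * either something is already marked lost, or the duplicate-ACK/SACK
 * heuristics show more out-of-order feedback than the current reordering
 * estimate allows.
 */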
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->lost_out)
		return true;

	if (tcp_dupack_heuristics(tp) > tp->reordering)
		return true;

	return false;
}

2193
2194
2195
2196
2197
2198
2199static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2200{
2201 struct tcp_sock *tp = tcp_sk(sk);
2202 struct sk_buff *skb;
2203 int cnt, oldcnt, lost;
2204 unsigned int mss;
2205
2206 const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2207
2208 WARN_ON(packets > tp->packets_out);
2209 if (tp->lost_skb_hint) {
2210 skb = tp->lost_skb_hint;
2211 cnt = tp->lost_cnt_hint;
2212
2213 if (mark_head && skb != tcp_write_queue_head(sk))
2214 return;
2215 } else {
2216 skb = tcp_write_queue_head(sk);
2217 cnt = 0;
2218 }
2219
2220 tcp_for_write_queue_from(skb, sk) {
2221 if (skb == tcp_send_head(sk))
2222 break;
2223
2224
2225 tp->lost_skb_hint = skb;
2226 tp->lost_cnt_hint = cnt;
2227
2228 if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
2229 break;
2230
2231 oldcnt = cnt;
2232 if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
2233 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2234 cnt += tcp_skb_pcount(skb);
2235
2236 if (cnt > packets) {
2237 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2238 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2239 (oldcnt >= packets))
2240 break;
2241
2242 mss = tcp_skb_mss(skb);
2243
2244 lost = (packets - oldcnt) * mss;
2245 if (lost < skb->len &&
2246 tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
2247 break;
2248 cnt = packets;
2249 }
2250
2251 tcp_skb_mark_lost(tp, skb);
2252
2253 if (mark_head)
2254 break;
2255 }
2256 tcp_verify_left_out(tp);
2257}
2258
2259
2260
2261static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2262{
2263 struct tcp_sock *tp = tcp_sk(sk);
2264
2265 if (tcp_is_reno(tp)) {
2266 tcp_mark_head_lost(sk, 1, 1);
2267 } else if (tcp_is_fack(tp)) {
2268 int lost = tp->fackets_out - tp->reordering;
2269 if (lost <= 0)
2270 lost = 1;
2271 tcp_mark_head_lost(sk, lost, 0);
2272 } else {
2273 int sacked_upto = tp->sacked_out - tp->reordering;
2274 if (sacked_upto >= 0)
2275 tcp_mark_head_lost(sk, sacked_upto, 0);
2276 else if (fast_rexmit)
2277 tcp_mark_head_lost(sk, 1, 1);
2278 }
2279}
2280
2281static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
2282{
2283 return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2284 before(tp->rx_opt.rcv_tsecr, when);
2285}
2286
2287
2288
2289
2290static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
2291 const struct sk_buff *skb)
2292{
2293 return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
2294 tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
2295}
2296
2297
2298
2299
2300static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
2301{
2302 return !tp->retrans_stamp ||
2303 tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
2304}
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322static bool tcp_any_retrans_done(const struct sock *sk)
2323{
2324 const struct tcp_sock *tp = tcp_sk(sk);
2325 struct sk_buff *skb;
2326
2327 if (tp->retrans_out)
2328 return true;
2329
2330 skb = tcp_write_queue_head(sk);
2331 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2332 return true;
2333
2334 return false;
2335}
2336
2337#if FASTRETRANS_DEBUG > 1
2338static void DBGUNDO(struct sock *sk, const char *msg)
2339{
2340 struct tcp_sock *tp = tcp_sk(sk);
2341 struct inet_sock *inet = inet_sk(sk);
2342
2343 if (sk->sk_family == AF_INET) {
2344 pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2345 msg,
2346 &inet->inet_daddr, ntohs(inet->inet_dport),
2347 tp->snd_cwnd, tcp_left_out(tp),
2348 tp->snd_ssthresh, tp->prior_ssthresh,
2349 tp->packets_out);
2350 }
2351#if IS_ENABLED(CONFIG_IPV6)
2352 else if (sk->sk_family == AF_INET6) {
2353 pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2354 msg,
2355 &sk->sk_v6_daddr, ntohs(inet->inet_dport),
2356 tp->snd_cwnd, tcp_left_out(tp),
2357 tp->snd_ssthresh, tp->prior_ssthresh,
2358 tp->packets_out);
2359 }
2360#endif
2361}
2362#else
2363#define DBGUNDO(x...) do { } while (0)
2364#endif
2365
2366static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2367{
2368 struct tcp_sock *tp = tcp_sk(sk);
2369
2370 if (unmark_loss) {
2371 struct sk_buff *skb;
2372
2373 tcp_for_write_queue(skb, sk) {
2374 if (skb == tcp_send_head(sk))
2375 break;
2376 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
2377 }
2378 tp->lost_out = 0;
2379 tcp_clear_all_retrans_hints(tp);
2380 }
2381
2382 if (tp->prior_ssthresh) {
2383 const struct inet_connection_sock *icsk = inet_csk(sk);
2384
2385 tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
2386
2387 if (tp->prior_ssthresh > tp->snd_ssthresh) {
2388 tp->snd_ssthresh = tp->prior_ssthresh;
2389 tcp_ecn_withdraw_cwr(tp);
2390 }
2391 }
2392 tp->snd_cwnd_stamp = tcp_jiffies32;
2393 tp->undo_marker = 0;
2394}
2395
2396static inline bool tcp_may_undo(const struct tcp_sock *tp)
2397{
2398 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
2399}
2400
2401
2402static bool tcp_try_undo_recovery(struct sock *sk)
2403{
2404 struct tcp_sock *tp = tcp_sk(sk);
2405
2406 if (tcp_may_undo(tp)) {
2407 int mib_idx;
2408
2409
2410
2411
2412 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2413 tcp_undo_cwnd_reduction(sk, false);
2414 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2415 mib_idx = LINUX_MIB_TCPLOSSUNDO;
2416 else
2417 mib_idx = LINUX_MIB_TCPFULLUNDO;
2418
2419 NET_INC_STATS(sock_net(sk), mib_idx);
2420 }
2421 if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2422
2423
2424
2425 if (!tcp_any_retrans_done(sk))
2426 tp->retrans_stamp = 0;
2427 return true;
2428 }
2429 tcp_set_ca_state(sk, TCP_CA_Open);
2430 return false;
2431}
2432
2433
2434static bool tcp_try_undo_dsack(struct sock *sk)
2435{
2436 struct tcp_sock *tp = tcp_sk(sk);
2437
2438 if (tp->undo_marker && !tp->undo_retrans) {
2439 DBGUNDO(sk, "D-SACK");
2440 tcp_undo_cwnd_reduction(sk, false);
2441 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2442 return true;
2443 }
2444 return false;
2445}
2446
2447
2448static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2449{
2450 struct tcp_sock *tp = tcp_sk(sk);
2451
2452 if (frto_undo || tcp_may_undo(tp)) {
2453 tcp_undo_cwnd_reduction(sk, true);
2454
2455 DBGUNDO(sk, "partial loss");
2456 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2457 if (frto_undo)
2458 NET_INC_STATS(sock_net(sk),
2459 LINUX_MIB_TCPSPURIOUSRTOS);
2460 inet_csk(sk)->icsk_retransmits = 0;
2461 if (frto_undo || tcp_is_sack(tp))
2462 tcp_set_ca_state(sk, TCP_CA_Open);
2463 return true;
2464 }
2465 return false;
2466}
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477static void tcp_init_cwnd_reduction(struct sock *sk)
2478{
2479 struct tcp_sock *tp = tcp_sk(sk);
2480
2481 tp->high_seq = tp->snd_nxt;
2482 tp->tlp_high_seq = 0;
2483 tp->snd_cwnd_cnt = 0;
2484 tp->prior_cwnd = tp->snd_cwnd;
2485 tp->prr_delivered = 0;
2486 tp->prr_out = 0;
2487 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2488 tcp_ecn_queue_cwr(tp);
2489}
2490
2491void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
2492{
2493 struct tcp_sock *tp = tcp_sk(sk);
2494 int sndcnt = 0;
2495 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2496
2497 if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
2498 return;
2499
2500 tp->prr_delivered += newly_acked_sacked;
2501 if (delta < 0) {
2502 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2503 tp->prior_cwnd - 1;
2504 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2505 } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
2506 !(flag & FLAG_LOST_RETRANS)) {
2507 sndcnt = min_t(int, delta,
2508 max_t(int, tp->prr_delivered - tp->prr_out,
2509 newly_acked_sacked) + 1);
2510 } else {
2511 sndcnt = min(delta, newly_acked_sacked);
2512 }
2513
2514 sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
2515 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2516}
2517
2518static inline void tcp_end_cwnd_reduction(struct sock *sk)
2519{
2520 struct tcp_sock *tp = tcp_sk(sk);
2521
2522 if (inet_csk(sk)->icsk_ca_ops->cong_control)
2523 return;
2524
2525
2526 if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
2527 (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
2528 tp->snd_cwnd = tp->snd_ssthresh;
2529 tp->snd_cwnd_stamp = tcp_jiffies32;
2530 }
2531 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2532}
2533
2534
2535void tcp_enter_cwr(struct sock *sk)
2536{
2537 struct tcp_sock *tp = tcp_sk(sk);
2538
2539 tp->prior_ssthresh = 0;
2540 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2541 tp->undo_marker = 0;
2542 tcp_init_cwnd_reduction(sk);
2543 tcp_set_ca_state(sk, TCP_CA_CWR);
2544 }
2545}
2546EXPORT_SYMBOL(tcp_enter_cwr);
2547
2548static void tcp_try_keep_open(struct sock *sk)
2549{
2550 struct tcp_sock *tp = tcp_sk(sk);
2551 int state = TCP_CA_Open;
2552
2553 if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
2554 state = TCP_CA_Disorder;
2555
2556 if (inet_csk(sk)->icsk_ca_state != state) {
2557 tcp_set_ca_state(sk, state);
2558 tp->high_seq = tp->snd_nxt;
2559 }
2560}
2561
2562static void tcp_try_to_open(struct sock *sk, int flag)
2563{
2564 struct tcp_sock *tp = tcp_sk(sk);
2565
2566 tcp_verify_left_out(tp);
2567
2568 if (!tcp_any_retrans_done(sk))
2569 tp->retrans_stamp = 0;
2570
2571 if (flag & FLAG_ECE)
2572 tcp_enter_cwr(sk);
2573
2574 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2575 tcp_try_keep_open(sk);
2576 }
2577}
2578
2579static void tcp_mtup_probe_failed(struct sock *sk)
2580{
2581 struct inet_connection_sock *icsk = inet_csk(sk);
2582
2583 icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
2584 icsk->icsk_mtup.probe_size = 0;
2585 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
2586}
2587
2588static void tcp_mtup_probe_success(struct sock *sk)
2589{
2590 struct tcp_sock *tp = tcp_sk(sk);
2591 struct inet_connection_sock *icsk = inet_csk(sk);
2592
2593
2594 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2595 tp->snd_cwnd = tp->snd_cwnd *
2596 tcp_mss_to_mtu(sk, tp->mss_cache) /
2597 icsk->icsk_mtup.probe_size;
2598 tp->snd_cwnd_cnt = 0;
2599 tp->snd_cwnd_stamp = tcp_jiffies32;
2600 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2601
2602 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2603 icsk->icsk_mtup.probe_size = 0;
2604 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
2605 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
2606}
2607
2608
2609
2610
2611
2612void tcp_simple_retransmit(struct sock *sk)
2613{
2614 const struct inet_connection_sock *icsk = inet_csk(sk);
2615 struct tcp_sock *tp = tcp_sk(sk);
2616 struct sk_buff *skb;
2617 unsigned int mss = tcp_current_mss(sk);
2618
2619 tcp_for_write_queue(skb, sk) {
2620 if (skb == tcp_send_head(sk))
2621 break;
2622 if (tcp_skb_seglen(skb) > mss &&
2623 !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
2624 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2625 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
2626 tp->retrans_out -= tcp_skb_pcount(skb);
2627 }
2628 tcp_skb_mark_lost_uncond_verify(tp, skb);
2629 }
2630 }
2631
2632 tcp_clear_retrans_hints_partial(tp);
2633
2634 if (!tp->lost_out)
2635 return;
2636
2637 if (tcp_is_reno(tp))
2638 tcp_limit_reno_sacked(tp);
2639
2640 tcp_verify_left_out(tp);
2641
2642
2643
2644
2645
2646
2647 if (icsk->icsk_ca_state != TCP_CA_Loss) {
2648 tp->high_seq = tp->snd_nxt;
2649 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2650 tp->prior_ssthresh = 0;
2651 tp->undo_marker = 0;
2652 tcp_set_ca_state(sk, TCP_CA_Loss);
2653 }
2654 tcp_xmit_retransmit_queue(sk);
2655}
2656EXPORT_SYMBOL(tcp_simple_retransmit);
2657
2658void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2659{
2660 struct tcp_sock *tp = tcp_sk(sk);
2661 int mib_idx;
2662
2663 if (tcp_is_reno(tp))
2664 mib_idx = LINUX_MIB_TCPRENORECOVERY;
2665 else
2666 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2667
2668 NET_INC_STATS(sock_net(sk), mib_idx);
2669
2670 tp->prior_ssthresh = 0;
2671 tcp_init_undo(tp);
2672
2673 if (!tcp_in_cwnd_reduction(sk)) {
2674 if (!ece_ack)
2675 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2676 tcp_init_cwnd_reduction(sk);
2677 }
2678 tcp_set_ca_state(sk, TCP_CA_Recovery);
2679}
2680
2681
2682
2683
2684static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
2685 int *rexmit)
2686{
2687 struct tcp_sock *tp = tcp_sk(sk);
2688 bool recovered = !before(tp->snd_una, tp->high_seq);
2689
2690 if ((flag & FLAG_SND_UNA_ADVANCED) &&
2691 tcp_try_undo_loss(sk, false))
2692 return;
2693
2694
2695
2696
2697
2698
2699
2700
2701 if ((flag & FLAG_ORIG_SACK_ACKED) &&
2702 tcp_try_undo_loss(sk, tp->undo_marker))
2703 return;
2704
2705 if (tp->frto) {
2706 if (after(tp->snd_nxt, tp->high_seq)) {
2707 if (flag & FLAG_DATA_SACKED || is_dupack)
2708 tp->frto = 0;
2709 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2710 tp->high_seq = tp->snd_nxt;
2711
2712
2713
2714
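			/* F-RTO step 2.b: prefer sending new data if the
			 * window allows it; otherwise fall back to normal
			 * loss recovery.
			 */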
2715 if (tcp_send_head(sk) &&
2716 after(tcp_wnd_end(tp), tp->snd_nxt)) {
2717 *rexmit = REXMIT_NEW;
2718 return;
2719 }
2720 tp->frto = 0;
2721 }
2722 }
2723
2724 if (recovered) {
2725
2726 tcp_try_undo_recovery(sk);
2727 return;
2728 }
2729 if (tcp_is_reno(tp)) {
2730
2731
2732
2733 if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2734 tcp_add_reno_sack(sk);
2735 else if (flag & FLAG_SND_UNA_ADVANCED)
2736 tcp_reset_reno_sack(tp);
2737 }
2738 *rexmit = REXMIT_LOST;
2739}
2740
2741
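/* Handle a partial ACK during fast recovery: if DSACK/timestamp evidence
 * shows the hole was filled by a delayed original rather than our
 * retransmit, update the reordering estimate and, once no retransmits
 * remain in flight, undo the cwnd reduction.
 */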
2742static bool tcp_try_undo_partial(struct sock *sk, const int acked)
2743{
2744 struct tcp_sock *tp = tcp_sk(sk);
2745
2746 if (tp->undo_marker && tcp_packet_delayed(tp)) {
2747
2748
2749
2750 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
2751
2752
2753
2754
2755
2756
2757 if (tp->retrans_out)
2758 return true;
2759
2760 if (!tcp_any_retrans_done(sk))
2761 tp->retrans_stamp = 0;
2762
2763 DBGUNDO(sk, "partial recovery");
2764 tcp_undo_cwnd_reduction(sk, true);
2765 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2766 tcp_try_keep_open(sk);
2767 return true;
2768 }
2769 return false;
2770}
2771
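/* Run RACK time-based loss detection, flagging the ACK if it caused any
 * retransmitted packets to be marked lost again.
 */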
2772static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
2773{
2774 struct tcp_sock *tp = tcp_sk(sk);
2775
2776
2777 if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
2778 u32 prior_retrans = tp->retrans_out;
2779
2780 tcp_rack_mark_lost(sk);
2781 if (prior_retrans > tp->retrans_out)
2782 *ack_flag |= FLAG_LOST_RETRANS;
2783 }
2784}
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
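/* The fast-recovery state machine, called for every ACK that is "dubious"
 * (duplicate, SACK/ECE-marked, or received outside CA_Open).  It updates
 * loss accounting, moves the connection between Open/Disorder/CWR/Recovery/
 * Loss, and reports what should be (re)transmitted via *rexmit; the actual
 * sending is done later in tcp_xmit_retransmit_queue().
 */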
2798static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2799 bool is_dupack, int *ack_flag, int *rexmit)
2800{
2801 struct inet_connection_sock *icsk = inet_csk(sk);
2802 struct tcp_sock *tp = tcp_sk(sk);
2803 int fast_rexmit = 0, flag = *ack_flag;
2804 bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2805 (tcp_fackets_out(tp) > tp->reordering));
2806
2807 if (WARN_ON(!tp->packets_out && tp->sacked_out))
2808 tp->sacked_out = 0;
2809 if (WARN_ON(!tp->sacked_out && tp->fackets_out))
2810 tp->fackets_out = 0;
2811
2812
2813
2814 if (flag & FLAG_ECE)
2815 tp->prior_ssthresh = 0;
2816
2817
2818 if (tcp_check_sack_reneging(sk, flag))
2819 return;
2820
2821
2822 tcp_verify_left_out(tp);
2823
2824
2825
2826 if (icsk->icsk_ca_state == TCP_CA_Open) {
2827 WARN_ON(tp->retrans_out != 0);
2828 tp->retrans_stamp = 0;
2829 } else if (!before(tp->snd_una, tp->high_seq)) {
2830 switch (icsk->icsk_ca_state) {
2831 case TCP_CA_CWR:
2832
2833
2834 if (tp->snd_una != tp->high_seq) {
2835 tcp_end_cwnd_reduction(sk);
2836 tcp_set_ca_state(sk, TCP_CA_Open);
2837 }
2838 break;
2839
2840 case TCP_CA_Recovery:
2841 if (tcp_is_reno(tp))
2842 tcp_reset_reno_sack(tp);
2843 if (tcp_try_undo_recovery(sk))
2844 return;
2845 tcp_end_cwnd_reduction(sk);
2846 break;
2847 }
2848 }
2849
2850
2851 switch (icsk->icsk_ca_state) {
2852 case TCP_CA_Recovery:
2853 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
2854 if (tcp_is_reno(tp) && is_dupack)
2855 tcp_add_reno_sack(sk);
2856 } else {
2857 if (tcp_try_undo_partial(sk, acked))
2858 return;
2859
2860 do_lost = tcp_is_reno(tp) ||
2861 tcp_fackets_out(tp) > tp->reordering;
2862 }
2863 if (tcp_try_undo_dsack(sk)) {
2864 tcp_try_keep_open(sk);
2865 return;
2866 }
2867 tcp_rack_identify_loss(sk, ack_flag);
2868 break;
2869 case TCP_CA_Loss:
2870 tcp_process_loss(sk, flag, is_dupack, rexmit);
2871 tcp_rack_identify_loss(sk, ack_flag);
2872 if (!(icsk->icsk_ca_state == TCP_CA_Open ||
2873 (*ack_flag & FLAG_LOST_RETRANS)))
2874 return;
2875
2876 default:
2877 if (tcp_is_reno(tp)) {
2878 if (flag & FLAG_SND_UNA_ADVANCED)
2879 tcp_reset_reno_sack(tp);
2880 if (is_dupack)
2881 tcp_add_reno_sack(sk);
2882 }
2883
2884 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
2885 tcp_try_undo_dsack(sk);
2886
2887 tcp_rack_identify_loss(sk, ack_flag);
2888 if (!tcp_time_to_recover(sk, flag)) {
2889 tcp_try_to_open(sk, flag);
2890 return;
2891 }
2892
2893
2894 if (icsk->icsk_ca_state < TCP_CA_CWR &&
2895 icsk->icsk_mtup.probe_size &&
2896 tp->snd_una == tp->mtu_probe.probe_seq_start) {
2897 tcp_mtup_probe_failed(sk);
2898
2899 tp->snd_cwnd++;
2900 tcp_simple_retransmit(sk);
2901 return;
2902 }
2903
2904
2905 tcp_enter_recovery(sk, (flag & FLAG_ECE));
2906 fast_rexmit = 1;
2907 }
2908
2909 if (do_lost)
2910 tcp_update_scoreboard(sk, fast_rexmit);
2911 *rexmit = REXMIT_LOST;
2912}
2913
2914static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2915{
2916 struct tcp_sock *tp = tcp_sk(sk);
2917 u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
2918
2919 minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32,
2920 rtt_us ? : jiffies_to_usecs(1));
2921}
2922
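/* Feed a new RTT sample into the estimator.  Prefer an RTT measured on a
 * cumulatively acked, never-retransmitted skb, then a SACK-based sample,
 * and finally the timestamp echo (per Karn's algorithm, retransmitted
 * segments never produce direct samples).
 */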
2923static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2924 long seq_rtt_us, long sack_rtt_us,
2925 long ca_rtt_us, struct rate_sample *rs)
2926{
2927 const struct tcp_sock *tp = tcp_sk(sk);
2928
2929
2930
2931
2932
2933
2934 if (seq_rtt_us < 0)
2935 seq_rtt_us = sack_rtt_us;
2936
2937
2938
2939
2940
2941
2942
2943 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2944 flag & FLAG_ACKED) {
2945 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
2946 u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
2947
2948 seq_rtt_us = ca_rtt_us = delta_us;
2949 }
2950 rs->rtt_us = ca_rtt_us;
2951 if (seq_rtt_us < 0)
2952 return false;
2953
2954
2955
2956
2957
2958 tcp_update_rtt_min(sk, ca_rtt_us);
2959 tcp_rtt_estimator(sk, seq_rtt_us);
2960 tcp_set_rto(sk);
2961
2962
2963 inet_csk(sk)->icsk_backoff = 0;
2964 return true;
2965}
2966
2967
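/* Measure the SYN-ACK RTT when completing a passive open, provided the
 * SYN-ACK was not retransmitted.
 */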
2968void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
2969{
2970 struct rate_sample rs;
2971 long rtt_us = -1L;
2972
2973 if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
2974 rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
2975
2976 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
2977}
2978
2979
2980static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
2981{
2982 const struct inet_connection_sock *icsk = inet_csk(sk);
2983
2984 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
2985 tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
2986}
2987
2988
2989
2990
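/* Re-arm the retransmission timer after forward progress.  If a loss-probe
 * or reordering timer is currently pending, the RTO is set to the remaining
 * lifetime of the head skb rather than a full RTO from now.
 */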
2991void tcp_rearm_rto(struct sock *sk)
2992{
2993 const struct inet_connection_sock *icsk = inet_csk(sk);
2994 struct tcp_sock *tp = tcp_sk(sk);
2995
2996
2997
2998
2999 if (tp->fastopen_rsk)
3000 return;
3001
3002 if (!tp->packets_out) {
3003 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3004 } else {
3005 u32 rto = inet_csk(sk)->icsk_rto;
3006
3007 if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
3008 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3009 s64 delta_us = tcp_rto_delta_us(sk);
3010
3011
3012
3013 rto = usecs_to_jiffies(max_t(int, delta_us, 1));
3014 }
3015 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3016 TCP_RTO_MAX);
3017 }
3018}
3019
3020
3021static void tcp_set_xmit_timer(struct sock *sk)
3022{
3023 if (!tcp_schedule_loss_probe(sk))
3024 tcp_rearm_rto(sk);
3025}
3026
3027
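/* The skb at the head of the queue was only partially acked (TSO): trim the
 * acked part off its head and return how many segments that covered.
 */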
3028static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3029{
3030 struct tcp_sock *tp = tcp_sk(sk);
3031 u32 packets_acked;
3032
3033 BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
3034
3035 packets_acked = tcp_skb_pcount(skb);
3036 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3037 return 0;
3038 packets_acked -= tcp_skb_pcount(skb);
3039
3040 if (packets_acked) {
3041 BUG_ON(tcp_skb_pcount(skb) == 0);
3042 BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
3043 }
3044
3045 return packets_acked;
3046}
3047
3048static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
3049 u32 prior_snd_una)
3050{
3051 const struct skb_shared_info *shinfo;
3052
3053
3054 if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
3055 return;
3056
3057 shinfo = skb_shinfo(skb);
3058 if (!before(shinfo->tskey, prior_snd_una) &&
3059 before(shinfo->tskey, tcp_sk(sk)->snd_una))
3060 __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
3061}
3062
3063
3064
3065
3066
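/* Remove fully acknowledged skbs from the retransmission queue, update the
 * SACK/lost/retrans accounting, collect RTT samples and pass the newly
 * acked packets to the congestion control module.
 */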
3067static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3068 u32 prior_snd_una, int *acked,
3069 struct tcp_sacktag_state *sack)
3070{
3071 const struct inet_connection_sock *icsk = inet_csk(sk);
3072 u64 first_ackt, last_ackt;
3073 struct tcp_sock *tp = tcp_sk(sk);
3074 u32 prior_sacked = tp->sacked_out;
3075 u32 reord = tp->packets_out;
3076 bool fully_acked = true;
3077 long sack_rtt_us = -1L;
3078 long seq_rtt_us = -1L;
3079 long ca_rtt_us = -1L;
3080 struct sk_buff *skb;
3081 u32 pkts_acked = 0;
3082 u32 last_in_flight = 0;
3083 bool rtt_update;
3084 int flag = 0;
3085
3086 first_ackt = 0;
3087
3088 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
3089 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3090 u8 sacked = scb->sacked;
3091 u32 acked_pcount;
3092
3093 tcp_ack_tstamp(sk, skb, prior_snd_una);
3094
3095
3096 if (after(scb->end_seq, tp->snd_una)) {
3097 if (tcp_skb_pcount(skb) == 1 ||
3098 !after(tp->snd_una, scb->seq))
3099 break;
3100
3101 acked_pcount = tcp_tso_acked(sk, skb);
3102 if (!acked_pcount)
3103 break;
3104 fully_acked = false;
3105 } else {
3106
3107 prefetchw(skb->next);
3108 acked_pcount = tcp_skb_pcount(skb);
3109 }
3110
3111 if (unlikely(sacked & TCPCB_RETRANS)) {
3112 if (sacked & TCPCB_SACKED_RETRANS)
3113 tp->retrans_out -= acked_pcount;
3114 flag |= FLAG_RETRANS_DATA_ACKED;
3115 } else if (!(sacked & TCPCB_SACKED_ACKED)) {
3116 last_ackt = skb->skb_mstamp;
3117 WARN_ON_ONCE(last_ackt == 0);
3118 if (!first_ackt)
3119 first_ackt = last_ackt;
3120
3121 last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
3122 reord = min(pkts_acked, reord);
3123 if (!after(scb->end_seq, tp->high_seq))
3124 flag |= FLAG_ORIG_SACK_ACKED;
3125 }
3126
3127 if (sacked & TCPCB_SACKED_ACKED) {
3128 tp->sacked_out -= acked_pcount;
3129 } else if (tcp_is_sack(tp)) {
3130 tp->delivered += acked_pcount;
3131 if (!tcp_skb_spurious_retrans(tp, skb))
3132 tcp_rack_advance(tp, sacked, scb->end_seq,
3133 skb->skb_mstamp);
3134 }
3135 if (sacked & TCPCB_LOST)
3136 tp->lost_out -= acked_pcount;
3137
3138 tp->packets_out -= acked_pcount;
3139 pkts_acked += acked_pcount;
3140 tcp_rate_skb_delivered(sk, skb, sack->rate);
3141
3142
3143
3144
3145
3146
3147
3148
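		/* The initial SYN sits on the write queue like any other
		 * segment but carries no user data; flag it as a SYN ack
		 * rather than acked data so we do not credit the congestion
		 * window for data that was never sent.
		 */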
3149 if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
3150 flag |= FLAG_DATA_ACKED;
3151 } else {
3152 flag |= FLAG_SYN_ACKED;
3153 tp->retrans_stamp = 0;
3154 }
3155
3156 if (!fully_acked)
3157 break;
3158
3159 tcp_unlink_write_queue(skb, sk);
3160 sk_wmem_free_skb(sk, skb);
3161 if (unlikely(skb == tp->retransmit_skb_hint))
3162 tp->retransmit_skb_hint = NULL;
3163 if (unlikely(skb == tp->lost_skb_hint))
3164 tp->lost_skb_hint = NULL;
3165 }
3166
3167 if (!skb)
3168 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
3169
3170 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
3171 tp->snd_up = tp->snd_una;
3172
3173 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3174 flag |= FLAG_SACK_RENEGING;
3175
3176 if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
3177 seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
3178 ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
3179 }
3180 if (sack->first_sackt) {
3181 sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
3182 ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
3183 }
3184 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3185 ca_rtt_us, sack->rate);
3186
3187 if (flag & FLAG_ACKED) {
3188 flag |= FLAG_SET_XMIT_TIMER;
3189 if (unlikely(icsk->icsk_mtup.probe_size &&
3190 !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3191 tcp_mtup_probe_success(sk);
3192 }
3193
3194 if (tcp_is_reno(tp)) {
3195 tcp_remove_reno_sacks(sk, pkts_acked);
3196 } else {
3197 int delta;
3198
3199
3200 if (reord < prior_fackets && reord <= tp->fackets_out)
3201 tcp_update_reordering(sk, tp->fackets_out - reord, 0);
3202
3203 delta = tcp_is_fack(tp) ? pkts_acked :
3204 prior_sacked - tp->sacked_out;
3205 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3206 }
3207
3208 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3209
3210 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3211 sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
3212
3213
3214
3215
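		/* Nothing was cumulatively acked, but the SACK RTT shows the
		 * sacked data predates the head skb's last (re)transmission,
		 * so re-arming the RTO/TLP timer cannot keep extending the
		 * timeout during loss recovery.
		 */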
3216 flag |= FLAG_SET_XMIT_TIMER;
3217 }
3218
3219 if (icsk->icsk_ca_ops->pkts_acked) {
3220 struct ack_sample sample = { .pkts_acked = pkts_acked,
3221 .rtt_us = sack->rate->rtt_us,
3222 .in_flight = last_in_flight };
3223
3224 icsk->icsk_ca_ops->pkts_acked(sk, &sample);
3225 }
3226
3227#if FASTRETRANS_DEBUG > 0
3228 WARN_ON((int)tp->sacked_out < 0);
3229 WARN_ON((int)tp->lost_out < 0);
3230 WARN_ON((int)tp->retrans_out < 0);
3231 if (!tp->packets_out && tcp_is_sack(tp)) {
3232 icsk = inet_csk(sk);
3233 if (tp->lost_out) {
3234 pr_debug("Leak l=%u %d\n",
3235 tp->lost_out, icsk->icsk_ca_state);
3236 tp->lost_out = 0;
3237 }
3238 if (tp->sacked_out) {
3239 pr_debug("Leak s=%u %d\n",
3240 tp->sacked_out, icsk->icsk_ca_state);
3241 tp->sacked_out = 0;
3242 }
3243 if (tp->retrans_out) {
3244 pr_debug("Leak r=%u %d\n",
3245 tp->retrans_out, icsk->icsk_ca_state);
3246 tp->retrans_out = 0;
3247 }
3248 }
3249#endif
3250 *acked = pkts_acked;
3251 return flag;
3252}
3253
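/* Zero-window probe bookkeeping: if the window has opened enough for the
 * next segment, stop the probe0 timer, otherwise re-arm it.
 */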
3254static void tcp_ack_probe(struct sock *sk)
3255{
3256 const struct tcp_sock *tp = tcp_sk(sk);
3257 struct inet_connection_sock *icsk = inet_csk(sk);
3258
3259
3260
3261 if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
3262 icsk->icsk_backoff = 0;
3263 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3264
3265
3266
3267 } else {
3268 unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
3269
3270 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3271 when, TCP_RTO_MAX);
3272 }
3273}
3274
3275static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3276{
3277 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3278 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3279}
3280
3281
3282static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3283{
3284
3285
3286
3287
3288
3289
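	/* With high reordering, grow cwnd on any forward progress regardless
	 * of ordering; otherwise stay conservative and grow only on in-order
	 * (cumulatively acked) delivery, per RFC 5681.
	 */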
3290 if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
3291 return flag & FLAG_FORWARD_PROGRESS;
3292
3293 return flag & FLAG_DATA_ACKED;
3294}
3295
3296
3297
3298
3299
3300
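/* Congestion control dispatch, run once per ACK with a fresh rate sample:
 * either hand full control to a cong_control module (e.g. BBR) or run the
 * classic PRR cwnd-reduction / congestion-avoidance paths, then update the
 * pacing rate.
 */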
3301static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
3302 int flag, const struct rate_sample *rs)
3303{
3304 const struct inet_connection_sock *icsk = inet_csk(sk);
3305
3306 if (icsk->icsk_ca_ops->cong_control) {
3307 icsk->icsk_ca_ops->cong_control(sk, rs);
3308 return;
3309 }
3310
3311 if (tcp_in_cwnd_reduction(sk)) {
3312
3313 tcp_cwnd_reduction(sk, acked_sacked, flag);
3314 } else if (tcp_may_raise_cwnd(sk, flag)) {
3315
3316 tcp_cong_avoid(sk, ack, acked_sacked);
3317 }
3318 tcp_update_pacing_rate(sk);
3319}
3320
3321
3322
3323
3324static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3325 const u32 ack, const u32 ack_seq,
3326 const u32 nwin)
3327{
3328 return after(ack, tp->snd_una) ||
3329 after(ack_seq, tp->snd_wl1) ||
3330 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3331}
3332
3333
3334static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
3335{
3336 u32 delta = ack - tp->snd_una;
3337
3338 sock_owned_by_me((struct sock *)tp);
3339 tp->bytes_acked += delta;
3340 tp->snd_una = ack;
3341}
3342
3343
3344static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
3345{
3346 u32 delta = seq - tp->rcv_nxt;
3347
3348 sock_owned_by_me((struct sock *)tp);
3349 tp->bytes_received += delta;
3350 tp->rcv_nxt = seq;
3351}
3352
3353
3354
3355
3356
3357
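/* Update our record of the peer's advertised receive window from this ACK,
 * following the window update rules of RFC 793 / RFC 1122, and advance
 * snd_una.
 */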
3358static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
3359 u32 ack_seq)
3360{
3361 struct tcp_sock *tp = tcp_sk(sk);
3362 int flag = 0;
3363 u32 nwin = ntohs(tcp_hdr(skb)->window);
3364
3365 if (likely(!tcp_hdr(skb)->syn))
3366 nwin <<= tp->rx_opt.snd_wscale;
3367
3368 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
3369 flag |= FLAG_WIN_UPDATE;
3370 tcp_update_wl(tp, ack_seq);
3371
3372 if (tp->snd_wnd != nwin) {
3373 tp->snd_wnd = nwin;
3374
3375
3376
3377
3378 tp->pred_flags = 0;
3379 tcp_fast_path_check(sk);
3380
3381 if (tcp_send_head(sk))
3382 tcp_slow_start_after_idle_check(sk);
3383
3384 if (nwin > tp->max_window) {
3385 tp->max_window = nwin;
3386 tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
3387 }
3388 }
3389 }
3390
3391 tcp_snd_una_update(tp, ack);
3392
3393 return flag;
3394}
3395
3396static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
3397 u32 *last_oow_ack_time)
3398{
3399 if (*last_oow_ack_time) {
3400 s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
3401
3402 if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
3403 NET_INC_STATS(net, mib_idx);
3404 return true;
3405 }
3406 }
3407
3408 *last_oow_ack_time = tcp_jiffies32;
3409
3410 return false;
3411}
3412
3413
3414
3415
3416
3417
3418
3419
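/* RFC 5961: rate-limit replies (dupacks/challenge ACKs) to out-of-window
 * segments so a flood of bogus segments cannot make us flood the network
 * in return.  Data-bearing segments are never rate-limited here.
 */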
3420bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
3421 int mib_idx, u32 *last_oow_ack_time)
3422{
3423
3424 if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
3425 !tcp_hdr(skb)->syn)
3426 return false;
3427
3428 return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
3429}
3430
3431
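/* RFC 5961 7 [ACK Throttling]: send a challenge ACK, subject to a per-socket
 * limit and a host-wide, randomised per-second budget.
 */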
3432static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
3433{
3434
3435 static u32 challenge_timestamp;
3436 static unsigned int challenge_count;
3437 struct tcp_sock *tp = tcp_sk(sk);
3438 u32 count, now;
3439
3440
3441 if (__tcp_oow_rate_limited(sock_net(sk),
3442 LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
3443 &tp->last_oow_ack_time))
3444 return;
3445
3446
3447 now = jiffies / HZ;
3448 if (now != challenge_timestamp) {
3449 u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
3450
3451 challenge_timestamp = now;
3452 WRITE_ONCE(challenge_count, half +
3453 prandom_u32_max(sysctl_tcp_challenge_ack_limit));
3454 }
3455 count = READ_ONCE(challenge_count);
3456 if (count > 0) {
3457 WRITE_ONCE(challenge_count, count - 1);
3458 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
3459 tcp_send_ack(sk);
3460 }
3461}
3462
3463static void tcp_store_ts_recent(struct tcp_sock *tp)
3464{
3465 tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3466 tp->rx_opt.ts_recent_stamp = get_seconds();
3467}
3468
3469static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3470{
3471 if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3472
3473
3474
3475
3476
3477
3478
3479 if (tcp_paws_check(&tp->rx_opt, 0))
3480 tcp_store_ts_recent(tp);
3481 }
3482}
3483
3484
3485
3486
3487
3488
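/* Handle ACKs during a tail loss probe (TLP) episode: a DSACK means the
 * probe was a duplicate and the episode ends; an ACK beyond tlp_high_seq
 * without DSACK means the probe repaired a real loss, so do a one-off
 * CWR-style cwnd reduction and count a loss-probe recovery.
 */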
3489static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3490{
3491 struct tcp_sock *tp = tcp_sk(sk);
3492
3493 if (before(ack, tp->tlp_high_seq))
3494 return;
3495
3496 if (flag & FLAG_DSACKING_ACK) {
3497
3498 tp->tlp_high_seq = 0;
3499 } else if (after(ack, tp->tlp_high_seq)) {
3500
3501
3502
3503 tcp_init_cwnd_reduction(sk);
3504 tcp_set_ca_state(sk, TCP_CA_CWR);
3505 tcp_end_cwnd_reduction(sk);
3506 tcp_try_keep_open(sk);
3507 NET_INC_STATS(sock_net(sk),
3508 LINUX_MIB_TCPLOSSPROBERECOVERY);
3509 } else if (!(flag & (FLAG_SND_UNA_ADVANCED |
3510 FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
3511
3512 tp->tlp_high_seq = 0;
3513 }
3514}
3515
3516static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
3517{
3518 const struct inet_connection_sock *icsk = inet_csk(sk);
3519
3520 if (icsk->icsk_ca_ops->in_ack_event)
3521 icsk->icsk_ca_ops->in_ack_event(sk, flags);
3522}
3523
3524
3525
3526
3527
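/* Congestion control has already been run for this ACK; now transmit what
 * the recovery state machine asked for: new data first for an F-RTO probe
 * (REXMIT_NEW), otherwise retransmissions of lost packets.
 */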
3528static void tcp_xmit_recovery(struct sock *sk, int rexmit)
3529{
3530 struct tcp_sock *tp = tcp_sk(sk);
3531
3532 if (rexmit == REXMIT_NONE)
3533 return;
3534
3535 if (unlikely(rexmit == REXMIT_NEW)) {
3536 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
3537 TCP_NAGLE_OFF);
3538 if (after(tp->snd_nxt, tp->high_seq))
3539 return;
3540 tp->frto = 0;
3541 }
3542 tcp_xmit_retransmit_queue(sk);
3543}
3544
3545
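/* This routine deals with incoming ACKs, but not outgoing ones. */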
3546static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3547{
3548 struct inet_connection_sock *icsk = inet_csk(sk);
3549 struct tcp_sock *tp = tcp_sk(sk);
3550 struct tcp_sacktag_state sack_state;
3551 struct rate_sample rs = { .prior_delivered = 0 };
3552 u32 prior_snd_una = tp->snd_una;
3553 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3554 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3555 bool is_dupack = false;
3556 u32 prior_fackets;
3557 int prior_packets = tp->packets_out;
3558 u32 delivered = tp->delivered;
3559 u32 lost = tp->lost;
3560 int acked = 0;
3561 int rexmit = REXMIT_NONE;
3562
3563 sack_state.first_sackt = 0;
3564 sack_state.rate = &rs;
3565
3566
3567 prefetchw(sk->sk_write_queue.next);
3568
3569
3570
3571
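	/* An ACK older than snd_una is usually just stale; but if it is older
	 * than the current window it may be a blind spoofing attempt, so
	 * answer with a rate-limited challenge ACK (RFC 5961).
	 */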
3572 if (before(ack, prior_snd_una)) {
3573
3574 if (before(ack, prior_snd_una - tp->max_window)) {
3575 if (!(flag & FLAG_NO_CHALLENGE_ACK))
3576 tcp_send_challenge_ack(sk, skb);
3577 return -1;
3578 }
3579 goto old_ack;
3580 }
3581
3582
3583
3584
3585 if (after(ack, tp->snd_nxt))
3586 goto invalid_ack;
3587
3588 if (after(ack, prior_snd_una)) {
3589 flag |= FLAG_SND_UNA_ADVANCED;
3590 icsk->icsk_retransmits = 0;
3591 }
3592
3593 prior_fackets = tp->fackets_out;
3594 rs.prior_in_flight = tcp_packets_in_flight(tp);
3595
3596
3597
3598
3599 if (flag & FLAG_UPDATE_TS_RECENT)
3600 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3601
3602 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
3603
3604
3605
3606
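		/* Header-predicted fast path: window unchanged, pure forward
		 * advance of snd_una, no further checks needed.
		 */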
3607 tcp_update_wl(tp, ack_seq);
3608 tcp_snd_una_update(tp, ack);
3609 flag |= FLAG_WIN_UPDATE;
3610
3611 tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
3612
3613 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
3614 } else {
3615 u32 ack_ev_flags = CA_ACK_SLOWPATH;
3616
3617 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
3618 flag |= FLAG_DATA;
3619 else
3620 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
3621
3622 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
3623
3624 if (TCP_SKB_CB(skb)->sacked)
3625 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3626 &sack_state);
3627
3628 if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
3629 flag |= FLAG_ECE;
3630 ack_ev_flags |= CA_ACK_ECE;
3631 }
3632
3633 if (flag & FLAG_WIN_UPDATE)
3634 ack_ev_flags |= CA_ACK_WIN_UPDATE;
3635
3636 tcp_in_ack_event(sk, ack_ev_flags);
3637 }
3638
3639
3640
3641
3642 sk->sk_err_soft = 0;
3643 icsk->icsk_probes_out = 0;
3644 tp->rcv_tstamp = tcp_jiffies32;
3645 if (!prior_packets)
3646 goto no_queue;
3647
3648
3649 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
3650 &sack_state);
3651
3652 if (tp->tlp_high_seq)
3653 tcp_process_tlp_ack(sk, ack, flag);
3654
3655 if (flag & FLAG_SET_XMIT_TIMER)
3656 tcp_set_xmit_timer(sk);
3657
3658 if (tcp_ack_is_dubious(sk, flag)) {
3659 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3660 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
3661 }
3662
3663 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3664 sk_dst_confirm(sk);
3665
3666 delivered = tp->delivered - delivered;
3667 lost = tp->lost - lost;
3668 tcp_rate_gen(sk, delivered, lost, sack_state.rate);
3669 tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
3670 tcp_xmit_recovery(sk, rexmit);
3671 return 1;
3672
3673no_queue:
3674
3675 if (flag & FLAG_DSACKING_ACK)
3676 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
3677
3678
3679
3680
3681 if (tcp_send_head(sk))
3682 tcp_ack_probe(sk);
3683
3684 if (tp->tlp_high_seq)
3685 tcp_process_tlp_ack(sk, ack, flag);
3686 return 1;
3687
3688invalid_ack:
3689 SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3690 return -1;
3691
3692old_ack:
3693
3694
3695
3696 if (TCP_SKB_CB(skb)->sacked) {
3697 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3698 &sack_state);
3699 tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
3700 tcp_xmit_recovery(sk, rexmit);
3701 }
3702
3703 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3704 return 0;
3705}
3706
3707static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
3708 bool syn, struct tcp_fastopen_cookie *foc,
3709 bool exp_opt)
3710{
3711
3712 if (!foc || !syn || len < 0 || (len & 1))
3713 return;
3714
3715 if (len >= TCP_FASTOPEN_COOKIE_MIN &&
3716 len <= TCP_FASTOPEN_COOKIE_MAX)
3717 memcpy(foc->val, cookie, len);
3718 else if (len != 0)
3719 len = -1;
3720 foc->len = len;
3721 foc->exp = exp_opt;
3722}
3723
3724
3725
3726
3727
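/* Parse TCP options.  Normally only called for SYN and SYN-ACK packets, but
 * also for established-flow segments whenever the fast parser below cannot
 * handle them.
 */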
3728void tcp_parse_options(const struct net *net,
3729 const struct sk_buff *skb,
3730 struct tcp_options_received *opt_rx, int estab,
3731 struct tcp_fastopen_cookie *foc)
3732{
3733 const unsigned char *ptr;
3734 const struct tcphdr *th = tcp_hdr(skb);
3735 int length = (th->doff * 4) - sizeof(struct tcphdr);
3736
3737 ptr = (const unsigned char *)(th + 1);
3738 opt_rx->saw_tstamp = 0;
3739
3740 while (length > 0) {
3741 int opcode = *ptr++;
3742 int opsize;
3743
3744 switch (opcode) {
3745 case TCPOPT_EOL:
3746 return;
3747 case TCPOPT_NOP:
3748 length--;
3749 continue;
3750 default:
3751 opsize = *ptr++;
3752 if (opsize < 2)
3753 return;
3754 if (opsize > length)
3755 return;
3756 switch (opcode) {
3757 case TCPOPT_MSS:
3758 if (opsize == TCPOLEN_MSS && th->syn && !estab) {
3759 u16 in_mss = get_unaligned_be16(ptr);
3760 if (in_mss) {
3761 if (opt_rx->user_mss &&
3762 opt_rx->user_mss < in_mss)
3763 in_mss = opt_rx->user_mss;
3764 opt_rx->mss_clamp = in_mss;
3765 }
3766 }
3767 break;
3768 case TCPOPT_WINDOW:
3769 if (opsize == TCPOLEN_WINDOW && th->syn &&
3770 !estab && net->ipv4.sysctl_tcp_window_scaling) {
3771 __u8 snd_wscale = *(__u8 *)ptr;
3772 opt_rx->wscale_ok = 1;
3773 if (snd_wscale > TCP_MAX_WSCALE) {
3774 net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
3775 __func__,
3776 snd_wscale,
3777 TCP_MAX_WSCALE);
3778 snd_wscale = TCP_MAX_WSCALE;
3779 }
3780 opt_rx->snd_wscale = snd_wscale;
3781 }
3782 break;
3783 case TCPOPT_TIMESTAMP:
3784 if ((opsize == TCPOLEN_TIMESTAMP) &&
3785 ((estab && opt_rx->tstamp_ok) ||
3786 (!estab && net->ipv4.sysctl_tcp_timestamps))) {
3787 opt_rx->saw_tstamp = 1;
3788 opt_rx->rcv_tsval = get_unaligned_be32(ptr);
3789 opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
3790 }
3791 break;
3792 case TCPOPT_SACK_PERM:
3793 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3794 !estab && net->ipv4.sysctl_tcp_sack) {
3795 opt_rx->sack_ok = TCP_SACK_SEEN;
3796 tcp_sack_reset(opt_rx);
3797 }
3798 break;
3799
3800 case TCPOPT_SACK:
3801 if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
3802 !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
3803 opt_rx->sack_ok) {
3804 TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
3805 }
3806 break;
3807#ifdef CONFIG_TCP_MD5SIG
3808 case TCPOPT_MD5SIG:
3809
3810
3811
3812
3813 break;
3814#endif
3815 case TCPOPT_FASTOPEN:
3816 tcp_parse_fastopen_option(
3817 opsize - TCPOLEN_FASTOPEN_BASE,
3818 ptr, th->syn, foc, false);
3819 break;
3820
3821 case TCPOPT_EXP:
3822
3823
3824
3825 if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
3826 get_unaligned_be16(ptr) ==
3827 TCPOPT_FASTOPEN_MAGIC)
3828 tcp_parse_fastopen_option(opsize -
3829 TCPOLEN_EXP_FASTOPEN_BASE,
3830 ptr + 2, th->syn, foc, true);
3831 break;
3832
3833 }
3834 ptr += opsize-2;
3835 length -= opsize;
3836 }
3837 }
3838}
3839EXPORT_SYMBOL(tcp_parse_options);
3840
3841static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
3842{
3843 const __be32 *ptr = (const __be32 *)(th + 1);
3844
3845 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3846 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3847 tp->rx_opt.saw_tstamp = 1;
3848 ++ptr;
3849 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3850 ++ptr;
3851 if (*ptr)
3852 tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
3853 else
3854 tp->rx_opt.rcv_tsecr = 0;
3855 return true;
3856 }
3857 return false;
3858}
3859
3860
3861
3862
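/* Fast option parsing: handle the common cases of no options or a single
 * aligned timestamp option inline, and fall back to tcp_parse_options()
 * for anything else.
 */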
3863static bool tcp_fast_parse_options(const struct net *net,
3864 const struct sk_buff *skb,
3865 const struct tcphdr *th, struct tcp_sock *tp)
3866{
3867
3868
3869
3870 if (th->doff == (sizeof(*th) / 4)) {
3871 tp->rx_opt.saw_tstamp = 0;
3872 return false;
3873 } else if (tp->rx_opt.tstamp_ok &&
3874 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
3875 if (tcp_parse_aligned_timestamp(tp, th))
3876 return true;
3877 }
3878
3879 tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
3880 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3881 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3882
3883 return true;
3884}
3885
3886#ifdef CONFIG_TCP_MD5SIG
3887
3888
3889
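/* Find the MD5 signature option and return a pointer to its digest, or NULL
 * if it is absent or malformed.
 */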
3890const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
3891{
3892 int length = (th->doff << 2) - sizeof(*th);
3893 const u8 *ptr = (const u8 *)(th + 1);
3894
3895
3896 if (length < TCPOLEN_MD5SIG)
3897 return NULL;
3898
3899 while (length > 0) {
3900 int opcode = *ptr++;
3901 int opsize;
3902
3903 switch (opcode) {
3904 case TCPOPT_EOL:
3905 return NULL;
3906 case TCPOPT_NOP:
3907 length--;
3908 continue;
3909 default:
3910 opsize = *ptr++;
3911 if (opsize < 2 || opsize > length)
3912 return NULL;
3913 if (opcode == TCPOPT_MD5SIG)
3914 return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
3915 }
3916 ptr += opsize - 2;
3917 length -= opsize;
3918 }
3919 return NULL;
3920}
3921EXPORT_SYMBOL(tcp_parse_md5sig_option);
3922#endif
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
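/* PAWS helper: a pure ACK that carries no data, acknowledges nothing new,
 * does not update the window and whose timestamp is at most about one RTO
 * old is most likely an ordinary ACK that was simply reordered or delayed,
 * so it should not be discarded by the PAWS check.
 */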
3947static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
3948{
3949 const struct tcp_sock *tp = tcp_sk(sk);
3950 const struct tcphdr *th = tcp_hdr(skb);
3951 u32 seq = TCP_SKB_CB(skb)->seq;
3952 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3953
3954 return (
3955 (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
3956
3957
3958 ack == tp->snd_una &&
3959
3960
3961 !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
3962
3963
3964 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
3965}
3966
3967static inline bool tcp_paws_discard(const struct sock *sk,
3968 const struct sk_buff *skb)
3969{
3970 const struct tcp_sock *tp = tcp_sk(sk);
3971
3972 return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
3973 !tcp_disordered_ack(sk, skb);
3974}
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
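/* Segment sequence validation: the segment must overlap the receive window,
 * i.e. it may not end before rcv_wup nor start beyond the right window edge.
 */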
3989static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
3990{
3991 return !before(end_seq, tp->rcv_wup) &&
3992 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
3993}
3994
3995
3996void tcp_reset(struct sock *sk)
3997{
3998
3999 switch (sk->sk_state) {
4000 case TCP_SYN_SENT:
4001 sk->sk_err = ECONNREFUSED;
4002 break;
4003 case TCP_CLOSE_WAIT:
4004 sk->sk_err = EPIPE;
4005 break;
4006 case TCP_CLOSE:
4007 return;
4008 default:
4009 sk->sk_err = ECONNRESET;
4010 }
4011
4012 smp_wmb();
4013
4014 tcp_done(sk);
4015
4016 if (!sock_flag(sk, SOCK_DEAD))
4017 sk->sk_error_report(sk);
4018}
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
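/* Process a FIN once it is validly in sequence: schedule an ACK, shut down
 * the receive side and walk the connection through the usual state
 * transitions (ESTABLISHED -> CLOSE_WAIT, FIN_WAIT1 -> CLOSING,
 * FIN_WAIT2 -> TIME_WAIT), then purge the out-of-order queue.
 */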
4034void tcp_fin(struct sock *sk)
4035{
4036 struct tcp_sock *tp = tcp_sk(sk);
4037
4038 inet_csk_schedule_ack(sk);
4039
4040 sk->sk_shutdown |= RCV_SHUTDOWN;
4041 sock_set_flag(sk, SOCK_DONE);
4042
4043 switch (sk->sk_state) {
4044 case TCP_SYN_RECV:
4045 case TCP_ESTABLISHED:
4046
4047 tcp_set_state(sk, TCP_CLOSE_WAIT);
4048 inet_csk(sk)->icsk_ack.pingpong = 1;
4049 break;
4050
4051 case TCP_CLOSE_WAIT:
4052 case TCP_CLOSING:
4053
4054
4055
4056 break;
4057 case TCP_LAST_ACK:
4058
4059 break;
4060
4061 case TCP_FIN_WAIT1:
4062
4063
4064
4065
4066 tcp_send_ack(sk);
4067 tcp_set_state(sk, TCP_CLOSING);
4068 break;
4069 case TCP_FIN_WAIT2:
4070
4071 tcp_send_ack(sk);
4072 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4073 break;
4074 default:
4075
4076
4077
4078 pr_err("%s: Impossible, sk->sk_state=%d\n",
4079 __func__, sk->sk_state);
4080 break;
4081 }
4082
4083
4084
4085
4086 skb_rbtree_purge(&tp->out_of_order_queue);
4087 if (tcp_is_sack(tp))
4088 tcp_sack_reset(&tp->rx_opt);
4089 sk_mem_reclaim(sk);
4090
4091 if (!sock_flag(sk, SOCK_DEAD)) {
4092 sk->sk_state_change(sk);
4093
4094
4095 if (sk->sk_shutdown == SHUTDOWN_MASK ||
4096 sk->sk_state == TCP_CLOSE)
4097 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
4098 else
4099 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
4100 }
4101}
4102
4103static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4104 u32 end_seq)
4105{
4106 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
4107 if (before(seq, sp->start_seq))
4108 sp->start_seq = seq;
4109 if (after(end_seq, sp->end_seq))
4110 sp->end_seq = end_seq;
4111 return true;
4112 }
4113 return false;
4114}
4115
4116static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4117{
4118 struct tcp_sock *tp = tcp_sk(sk);
4119
4120 if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
4121 int mib_idx;
4122
4123 if (before(seq, tp->rcv_nxt))
4124 mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
4125 else
4126 mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
4127
4128 NET_INC_STATS(sock_net(sk), mib_idx);
4129
4130 tp->rx_opt.dsack = 1;
4131 tp->duplicate_sack[0].start_seq = seq;
4132 tp->duplicate_sack[0].end_seq = end_seq;
4133 }
4134}
4135
4136static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4137{
4138 struct tcp_sock *tp = tcp_sk(sk);
4139
4140 if (!tp->rx_opt.dsack)
4141 tcp_dsack_set(sk, seq, end_seq);
4142 else
4143 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
4144}
4145
4146static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
4147{
4148 struct tcp_sock *tp = tcp_sk(sk);
4149
4150 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4151 before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4152 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4153 tcp_enter_quickack_mode(sk);
4154
4155 if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
4156 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4157
4158 if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
4159 end_seq = tp->rcv_nxt;
4160 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
4161 }
4162 }
4163
4164 tcp_send_ack(sk);
4165}
4166
4167
4168
4169
4170static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4171{
4172 int this_sack;
4173 struct tcp_sack_block *sp = &tp->selective_acks[0];
4174 struct tcp_sack_block *swalk = sp + 1;
4175
4176
4177
4178
4179 for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
4180 if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
4181 int i;
4182
4183
4184
4185
4186 tp->rx_opt.num_sacks--;
4187 for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
4188 sp[i] = sp[i + 1];
4189 continue;
4190 }
4191 this_sack++, swalk++;
4192 }
4193}
4194
4195static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4196{
4197 struct tcp_sock *tp = tcp_sk(sk);
4198 struct tcp_sack_block *sp = &tp->selective_acks[0];
4199 int cur_sacks = tp->rx_opt.num_sacks;
4200 int this_sack;
4201
4202 if (!cur_sacks)
4203 goto new_sack;
4204
4205 for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
4206 if (tcp_sack_extend(sp, seq, end_seq)) {
4207
4208 for (; this_sack > 0; this_sack--, sp--)
4209 swap(*sp, *(sp - 1));
4210 if (cur_sacks > 1)
4211 tcp_sack_maybe_coalesce(tp);
4212 return;
4213 }
4214 }
4215
4216
4217
4218
4219
4220
4221
4222 if (this_sack >= TCP_NUM_SACKS) {
4223 this_sack--;
4224 tp->rx_opt.num_sacks--;
4225 sp--;
4226 }
4227 for (; this_sack > 0; this_sack--, sp--)
4228 *sp = *(sp - 1);
4229
4230new_sack:
4231
4232 sp->start_seq = seq;
4233 sp->end_seq = end_seq;
4234 tp->rx_opt.num_sacks++;
4235}
4236
4237
4238
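/* rcv_nxt advanced: drop any SACK blocks that the cumulative ACK now fully
 * covers.
 */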
4239static void tcp_sack_remove(struct tcp_sock *tp)
4240{
4241 struct tcp_sack_block *sp = &tp->selective_acks[0];
4242 int num_sacks = tp->rx_opt.num_sacks;
4243 int this_sack;
4244
4245
4246 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4247 tp->rx_opt.num_sacks = 0;
4248 return;
4249 }
4250
4251 for (this_sack = 0; this_sack < num_sacks;) {
4252
4253 if (!before(tp->rcv_nxt, sp->start_seq)) {
4254 int i;
4255
4256
4257 WARN_ON(before(tp->rcv_nxt, sp->end_seq));
4258
4259
4260 for (i = this_sack+1; i < num_sacks; i++)
4261 tp->selective_acks[i-1] = tp->selective_acks[i];
4262 num_sacks--;
4263 continue;
4264 }
4265 this_sack++;
4266 sp++;
4267 }
4268 tp->rx_opt.num_sacks = num_sacks;
4269}
4270
4271enum tcp_queue {
4272 OOO_QUEUE,
4273 RCV_QUEUE,
4274};
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
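/* Try to merge skb @from into the tail skb @to of the receive or
 * out-of-order queue to save memory and shorten the queue.  Returns true if
 * the caller should free @from (its payload has been absorbed) instead of
 * queueing it.
 */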
4290static bool tcp_try_coalesce(struct sock *sk,
4291 enum tcp_queue dest,
4292 struct sk_buff *to,
4293 struct sk_buff *from,
4294 bool *fragstolen)
4295{
4296 int delta;
4297
4298 *fragstolen = false;
4299
4300
4301 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4302 return false;
4303
4304 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4305 return false;
4306
4307 atomic_add(delta, &sk->sk_rmem_alloc);
4308 sk_mem_charge(sk, delta);
4309 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4310 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4311 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4312 TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
4313
4314 if (TCP_SKB_CB(from)->has_rxtstamp) {
4315 TCP_SKB_CB(to)->has_rxtstamp = true;
4316 if (dest == OOO_QUEUE)
4317 TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp;
4318 else
4319 to->tstamp = from->tstamp;
4320 }
4321
4322 return true;
4323}
4324
4325static void tcp_drop(struct sock *sk, struct sk_buff *skb)
4326{
4327 sk_drops_add(sk, skb);
4328 __kfree_skb(skb);
4329}
4330
4331
4332
4333
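/* Move skbs that have become in-sequence from the out-of-order rbtree to
 * the receive queue, emitting DSACKs for any duplicated ranges.
 */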
4334static void tcp_ofo_queue(struct sock *sk)
4335{
4336 struct tcp_sock *tp = tcp_sk(sk);
4337 __u32 dsack_high = tp->rcv_nxt;
4338 bool fin, fragstolen, eaten;
4339 struct sk_buff *skb, *tail;
4340 struct rb_node *p;
4341
4342 p = rb_first(&tp->out_of_order_queue);
4343 while (p) {
4344 skb = rb_entry(p, struct sk_buff, rbnode);
4345 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4346 break;
4347
4348 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
4349 __u32 dsack = dsack_high;
4350 if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
4351 dsack_high = TCP_SKB_CB(skb)->end_seq;
4352 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4353 }
4354 p = rb_next(p);
4355 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
4356
4357 if (TCP_SKB_CB(skb)->has_rxtstamp)
4358 skb->tstamp = TCP_SKB_CB(skb)->swtstamp;
4359
4360 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4361 SOCK_DEBUG(sk, "ofo packet was already received\n");
4362 tcp_drop(sk, skb);
4363 continue;
4364 }
4365 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4366 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4367 TCP_SKB_CB(skb)->end_seq);
4368
4369 tail = skb_peek_tail(&sk->sk_receive_queue);
4370 eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE,
4371 tail, skb, &fragstolen);
4372 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4373 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
4374 if (!eaten)
4375 __skb_queue_tail(&sk->sk_receive_queue, skb);
4376 else
4377 kfree_skb_partial(skb, fragstolen);
4378
4379 if (unlikely(fin)) {
4380 tcp_fin(sk);
4381
4382
4383
4384 break;
4385 }
4386 }
4387}
4388
4389static bool tcp_prune_ofo_queue(struct sock *sk);
4390static int tcp_prune_queue(struct sock *sk);
4391
4392static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4393 unsigned int size)
4394{
4395 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4396 !sk_rmem_schedule(sk, skb, size)) {
4397
4398 if (tcp_prune_queue(sk) < 0)
4399 return -1;
4400
4401 while (!sk_rmem_schedule(sk, skb, size)) {
4402 if (!tcp_prune_ofo_queue(sk))
4403 return -1;
4404 }
4405 }
4406 return 0;
4407}
4408
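/* Queue an out-of-order segment in the OFO rbtree: coalesce with neighbours
 * where possible, drop fully overlapped skbs (sending DSACKs for the
 * duplicate data), and update the SACK blocks we will advertise.
 */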
4409static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4410{
4411 struct tcp_sock *tp = tcp_sk(sk);
4412 struct rb_node **p, *q, *parent;
4413 struct sk_buff *skb1;
4414 u32 seq, end_seq;
4415 bool fragstolen;
4416
4417 tcp_ecn_check_ce(tp, skb);
4418
4419 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4420 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
4421 tcp_drop(sk, skb);
4422 return;
4423 }
4424
4425
4426 if (TCP_SKB_CB(skb)->has_rxtstamp)
4427 TCP_SKB_CB(skb)->swtstamp = skb->tstamp;
4428
4429
4430 tp->pred_flags = 0;
4431 inet_csk_schedule_ack(sk);
4432
4433 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4434 seq = TCP_SKB_CB(skb)->seq;
4435 end_seq = TCP_SKB_CB(skb)->end_seq;
4436 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4437 tp->rcv_nxt, seq, end_seq);
4438
4439 p = &tp->out_of_order_queue.rb_node;
4440 if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4441
4442 if (tcp_is_sack(tp)) {
4443 tp->rx_opt.num_sacks = 1;
4444 tp->selective_acks[0].start_seq = seq;
4445 tp->selective_acks[0].end_seq = end_seq;
4446 }
4447 rb_link_node(&skb->rbnode, NULL, p);
4448 rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
4449 tp->ooo_last_skb = skb;
4450 goto end;
4451 }
4452
4453
4454
4455
4456 if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb,
4457 skb, &fragstolen)) {
4458coalesce_done:
4459 tcp_grow_window(sk, skb);
4460 kfree_skb_partial(skb, fragstolen);
4461 skb = NULL;
4462 goto add_sack;
4463 }
4464
4465 if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
4466 parent = &tp->ooo_last_skb->rbnode;
4467 p = &parent->rb_right;
4468 goto insert;
4469 }
4470
4471
4472 parent = NULL;
4473 while (*p) {
4474 parent = *p;
4475 skb1 = rb_entry(parent, struct sk_buff, rbnode);
4476 if (before(seq, TCP_SKB_CB(skb1)->seq)) {
4477 p = &parent->rb_left;
4478 continue;
4479 }
4480 if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4481 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4482
4483 NET_INC_STATS(sock_net(sk),
4484 LINUX_MIB_TCPOFOMERGE);
4485 __kfree_skb(skb);
4486 skb = NULL;
4487 tcp_dsack_set(sk, seq, end_seq);
4488 goto add_sack;
4489 }
4490 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4491
4492 tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
4493 } else {
4494
4495
4496
4497 rb_replace_node(&skb1->rbnode, &skb->rbnode,
4498 &tp->out_of_order_queue);
4499 tcp_dsack_extend(sk,
4500 TCP_SKB_CB(skb1)->seq,
4501 TCP_SKB_CB(skb1)->end_seq);
4502 NET_INC_STATS(sock_net(sk),
4503 LINUX_MIB_TCPOFOMERGE);
4504 __kfree_skb(skb1);
4505 goto merge_right;
4506 }
4507 } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1,
4508 skb, &fragstolen)) {
4509 goto coalesce_done;
4510 }
4511 p = &parent->rb_right;
4512 }
4513insert:
4514
4515 rb_link_node(&skb->rbnode, parent, p);
4516 rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
4517
4518merge_right:
4519
4520 while ((q = rb_next(&skb->rbnode)) != NULL) {
4521 skb1 = rb_entry(q, struct sk_buff, rbnode);
4522
4523 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4524 break;
4525 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4526 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4527 end_seq);
4528 break;
4529 }
4530 rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
4531 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4532 TCP_SKB_CB(skb1)->end_seq);
4533 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4534 tcp_drop(sk, skb1);
4535 }
4536
4537 if (!q)
4538 tp->ooo_last_skb = skb;
4539
4540add_sack:
4541 if (tcp_is_sack(tp))
4542 tcp_sack_new_ofo_skb(sk, seq, end_seq);
4543end:
4544 if (skb) {
4545 tcp_grow_window(sk, skb);
4546 skb_condense(skb);
4547 skb_set_owner_r(skb, sk);
4548 }
4549}
4550
4551static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4552 bool *fragstolen)
4553{
4554 int eaten;
4555 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4556
4557 __skb_pull(skb, hdrlen);
4558 eaten = (tail &&
4559 tcp_try_coalesce(sk, RCV_QUEUE, tail,
4560 skb, fragstolen)) ? 1 : 0;
4561 tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
4562 if (!eaten) {
4563 __skb_queue_tail(&sk->sk_receive_queue, skb);
4564 skb_set_owner_r(skb, sk);
4565 }
4566 return eaten;
4567}
4568
4569int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4570{
4571 struct sk_buff *skb;
4572 int err = -ENOMEM;
4573 int data_len = 0;
4574 bool fragstolen;
4575
4576 if (size == 0)
4577 return 0;
4578
4579 if (size > PAGE_SIZE) {
4580 int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
4581
4582 data_len = npages << PAGE_SHIFT;
4583 size = data_len + (size & ~PAGE_MASK);
4584 }
4585 skb = alloc_skb_with_frags(size - data_len, data_len,
4586 PAGE_ALLOC_COSTLY_ORDER,
4587 &err, sk->sk_allocation);
4588 if (!skb)
4589 goto err;
4590
4591 skb_put(skb, size - data_len);
4592 skb->data_len = data_len;
4593 skb->len = size;
4594
4595 if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4596 goto err_free;
4597
4598 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
4599 if (err)
4600 goto err_free;
4601
4602 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4603 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4604 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4605
4606 if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4607 WARN_ON_ONCE(fragstolen);
4608 __kfree_skb(skb);
4609 }
4610 return size;
4611
4612err_free:
4613 kfree_skb(skb);
4614err:
4615 return err;
4616
4617}
4618
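/* Queue an incoming data segment: in-sequence data goes to the receive
 * queue (coalesced with the tail skb when possible), already-received data
 * triggers a DSACK, and out-of-order data is handed to
 * tcp_data_queue_ofo().
 */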
4619static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4620{
4621 struct tcp_sock *tp = tcp_sk(sk);
4622 bool fragstolen;
4623 int eaten;
4624
4625 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
4626 __kfree_skb(skb);
4627 return;
4628 }
4629 skb_dst_drop(skb);
4630 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
4631
4632 tcp_ecn_accept_cwr(tp, skb);
4633
4634 tp->rx_opt.dsack = 0;
4635
4636
4637
4638
4639
4640 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
4641 if (tcp_receive_window(tp) == 0)
4642 goto out_of_window;
4643
4644
4645queue_and_out:
4646 if (skb_queue_len(&sk->sk_receive_queue) == 0)
4647 sk_forced_mem_schedule(sk, skb->truesize);
4648 else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4649 goto drop;
4650
4651 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4652 tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4653 if (skb->len)
4654 tcp_event_data_recv(sk, skb);
4655 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4656 tcp_fin(sk);
4657
4658 if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4659 tcp_ofo_queue(sk);
4660
4661
4662
4663
4664 if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
4665 inet_csk(sk)->icsk_ack.pingpong = 0;
4666 }
4667
4668 if (tp->rx_opt.num_sacks)
4669 tcp_sack_remove(tp);
4670
4671 tcp_fast_path_check(sk);
4672
4673 if (eaten > 0)
4674 kfree_skb_partial(skb, fragstolen);
4675 if (!sock_flag(sk, SOCK_DEAD))
4676 sk->sk_data_ready(sk);
4677 return;
4678 }
4679
4680 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4681
4682 NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4683 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4684
4685out_of_window:
4686 tcp_enter_quickack_mode(sk);
4687 inet_csk_schedule_ack(sk);
4688drop:
4689 tcp_drop(sk, skb);
4690 return;
4691 }
4692
4693
4694 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
4695 goto out_of_window;
4696
4697 tcp_enter_quickack_mode(sk);
4698
4699 if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4700
4701 SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
4702 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4703 TCP_SKB_CB(skb)->end_seq);
4704
4705 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
4706
4707
4708
4709
4710 if (!tcp_receive_window(tp))
4711 goto out_of_window;
4712 goto queue_and_out;
4713 }
4714
4715 tcp_data_queue_ofo(sk, skb);
4716}
4717
4718static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
4719{
4720 if (list)
4721 return !skb_queue_is_last(list, skb) ? skb->next : NULL;
4722
4723 return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
4724}
4725
4726static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4727 struct sk_buff_head *list,
4728 struct rb_root *root)
4729{
4730 struct sk_buff *next = tcp_skb_next(skb, list);
4731
4732 if (list)
4733 __skb_unlink(skb, list);
4734 else
4735 rb_erase(&skb->rbnode, root);
4736
4737 __kfree_skb(skb);
4738 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4739
4740 return next;
4741}
4742
4743
4744static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
4745{
4746 struct rb_node **p = &root->rb_node;
4747 struct rb_node *parent = NULL;
4748 struct sk_buff *skb1;
4749
4750 while (*p) {
4751 parent = *p;
4752 skb1 = rb_entry(parent, struct sk_buff, rbnode);
4753 if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
4754 p = &parent->rb_left;
4755 else
4756 p = &parent->rb_right;
4757 }
4758 rb_link_node(&skb->rbnode, parent, p);
4759 rb_insert_color(&skb->rbnode, root);
4760}
4761
4762
4763
4764
4765
4766
4767
4768
4769
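/* Collapse a contiguous run of skbs between @head and @tail covering
 * sequence space [start, end) into fewer, densely packed skbs to reduce
 * per-skb memory overhead.  SYN/FIN segments are never collapsed.
 */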
4770static void
4771tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
4772 struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
4773{
4774 struct sk_buff *skb = head, *n;
4775 struct sk_buff_head tmp;
4776 bool end_of_skbs;
4777
4778
4779
4780
4781restart:
4782 for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
4783 n = tcp_skb_next(skb, list);
4784
4785
4786 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4787 skb = tcp_collapse_one(sk, skb, list, root);
4788 if (!skb)
4789 break;
4790 goto restart;
4791 }
4792
4793
4794
4795
4796
4797
4798 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
4799 (tcp_win_from_space(skb->truesize) > skb->len ||
4800 before(TCP_SKB_CB(skb)->seq, start))) {
4801 end_of_skbs = false;
4802 break;
4803 }
4804
4805 if (n && n != tail &&
4806 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
4807 end_of_skbs = false;
4808 break;
4809 }
4810
4811
4812 start = TCP_SKB_CB(skb)->end_seq;
4813 }
4814 if (end_of_skbs ||
4815 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4816 return;
4817
4818 __skb_queue_head_init(&tmp);
4819
4820 while (before(start, end)) {
4821 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
4822 struct sk_buff *nskb;
4823
4824 nskb = alloc_skb(copy, GFP_ATOMIC);
4825 if (!nskb)
4826 break;
4827
4828 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4829 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4830 if (list)
4831 __skb_queue_before(list, skb, nskb);
4832 else
4833 __skb_queue_tail(&tmp, nskb);
4834 skb_set_owner_r(nskb, sk);
4835
4836
4837 while (copy > 0) {
4838 int offset = start - TCP_SKB_CB(skb)->seq;
4839 int size = TCP_SKB_CB(skb)->end_seq - start;
4840
4841 BUG_ON(offset < 0);
4842 if (size > 0) {
4843 size = min(copy, size);
4844 if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
4845 BUG();
4846 TCP_SKB_CB(nskb)->end_seq += size;
4847 copy -= size;
4848 start += size;
4849 }
4850 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4851 skb = tcp_collapse_one(sk, skb, list, root);
4852 if (!skb ||
4853 skb == tail ||
4854 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4855 goto end;
4856 }
4857 }
4858 }
4859end:
4860 skb_queue_walk_safe(&tmp, skb, n)
4861 tcp_rbtree_insert(root, skb);
4862}
4863
4864
4865
4866
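/* Walk the out-of-order rbtree and collapse each contiguous range of queued
 * skbs to reclaim memory.
 */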
4867static void tcp_collapse_ofo_queue(struct sock *sk)
4868{
4869 struct tcp_sock *tp = tcp_sk(sk);
4870 struct sk_buff *skb, *head;
4871 struct rb_node *p;
4872 u32 start, end;
4873
4874 p = rb_first(&tp->out_of_order_queue);
4875 skb = rb_entry_safe(p, struct sk_buff, rbnode);
4876new_range:
4877 if (!skb) {
4878 p = rb_last(&tp->out_of_order_queue);
4879
4880
4881
4882
4883 tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
4884 return;
4885 }
4886 start = TCP_SKB_CB(skb)->seq;
4887 end = TCP_SKB_CB(skb)->end_seq;
4888
4889 for (head = skb;;) {
4890 skb = tcp_skb_next(skb, NULL);
4891
4892
4893
4894
4895 if (!skb ||
4896 after(TCP_SKB_CB(skb)->seq, end) ||
4897 before(TCP_SKB_CB(skb)->end_seq, start)) {
4898 tcp_collapse(sk, NULL, &tp->out_of_order_queue,
4899 head, skb, start, end);
4900 goto new_range;
4901 }
4902
4903 if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
4904 start = TCP_SKB_CB(skb)->seq;
4905 if (after(TCP_SKB_CB(skb)->end_seq, end))
4906 end = TCP_SKB_CB(skb)->end_seq;
4907 }
4908}
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
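/* Make room by pruning the out-of-order queue, dropping the highest
 * sequence numbers first, until the socket is back under its limits.
 * Returns true if anything was freed.
 */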
4920static bool tcp_prune_ofo_queue(struct sock *sk)
4921{
4922 struct tcp_sock *tp = tcp_sk(sk);
4923 struct rb_node *node, *prev;
4924
4925 if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
4926 return false;
4927
4928 NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
4929 node = &tp->ooo_last_skb->rbnode;
4930 do {
4931 prev = rb_prev(node);
4932 rb_erase(node, &tp->out_of_order_queue);
4933 tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
4934 sk_mem_reclaim(sk);
4935 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
4936 !tcp_under_memory_pressure(sk))
4937 break;
4938 node = prev;
4939 } while (node);
4940 tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
4941
4942
4943
4944
4945
4946
4947 if (tp->rx_opt.sack_ok)
4948 tcp_sack_reset(&tp->rx_opt);
4949 return true;
4950}
4951
4952
4953
4954
4955
4956
4957
4958
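/* Reduce allocated receive memory to get the socket back within its limits:
 * clamp the window, collapse the queues and, as a last resort, prune the
 * out-of-order queue.  Returns < 0 if that still was not enough and incoming
 * data will have to be dropped.
 */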
4959static int tcp_prune_queue(struct sock *sk)
4960{
4961 struct tcp_sock *tp = tcp_sk(sk);
4962
4963 SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
4964
4965 NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
4966
4967 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
4968 tcp_clamp_window(sk);
4969 else if (tcp_under_memory_pressure(sk))
4970 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
4971
4972 tcp_collapse_ofo_queue(sk);
4973 if (!skb_queue_empty(&sk->sk_receive_queue))
4974 tcp_collapse(sk, &sk->sk_receive_queue, NULL,
4975 skb_peek(&sk->sk_receive_queue),
4976 NULL,
4977 tp->copied_seq, tp->rcv_nxt);
4978 sk_mem_reclaim(sk);
4979
4980 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
4981 return 0;
4982
4983
4984
4985
4986 tcp_prune_ofo_queue(sk);
4987
4988 if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
4989 return 0;
4990
4991
4992
4993
4994
4995 NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
4996
4997
4998 tp->pred_flags = 0;
4999 return -1;
5000}
5001
5002static bool tcp_should_expand_sndbuf(const struct sock *sk)
5003{
5004 const struct tcp_sock *tp = tcp_sk(sk);
5005
5006
5007
5008
5009 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
5010 return false;
5011
5012
5013 if (tcp_under_memory_pressure(sk))
5014 return false;
5015
5016
5017 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
5018 return false;
5019
5020
5021 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
5022 return false;
5023
5024 return true;
5025}
5026
5027
5028
5029
5030
5031
5032
5033static void tcp_new_space(struct sock *sk)
5034{
5035 struct tcp_sock *tp = tcp_sk(sk);
5036
5037 if (tcp_should_expand_sndbuf(sk)) {
5038 tcp_sndbuf_expand(sk);
5039 tp->snd_cwnd_stamp = tcp_jiffies32;
5040 }
5041
5042 sk->sk_write_space(sk);
5043}
5044
5045static void tcp_check_space(struct sock *sk)
5046{
5047 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
5048 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
5049
5050 smp_mb();
5051 if (sk->sk_socket &&
5052 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5053 tcp_new_space(sk);
5054 if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5055 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
5056 }
5057 }
5058}
5059
5060static inline void tcp_data_snd_check(struct sock *sk)
5061{
5062 tcp_push_pending_frames(sk);
5063 tcp_check_space(sk);
5064}
5065
5066
5067
5068
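/* Decide whether an ACK must go out immediately (more than one full-sized
 * segment unacknowledged and the window can advance, quickack mode, or
 * out-of-order data present) or whether a delayed ACK will do.
 */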
5069static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
5070{
5071 struct tcp_sock *tp = tcp_sk(sk);
5072
5073
5074 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
5075
5076
5077
5078 __tcp_select_window(sk) >= tp->rcv_wnd) ||
5079
5080 tcp_in_quickack_mode(sk) ||
5081
5082 (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
5083
5084 tcp_send_ack(sk);
5085 } else {
5086
5087 tcp_send_delayed_ack(sk);
5088 }
5089}
5090
5091static inline void tcp_ack_snd_check(struct sock *sk)
5092{
5093 if (!inet_csk_ack_scheduled(sk)) {
5094
5095 return;
5096 }
5097 __tcp_ack_snd_check(sk, 1);
5098}
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
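/* An URG flag was seen: validate and record the urgent pointer, send SIGURG
 * to the socket owner, and remember where the out-of-band byte will arrive.
 */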
5110static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
5111{
5112 struct tcp_sock *tp = tcp_sk(sk);
5113 u32 ptr = ntohs(th->urg_ptr);
5114
5115 if (ptr && !sysctl_tcp_stdurg)
5116 ptr--;
5117 ptr += ntohl(th->seq);
5118
5119
5120 if (after(tp->copied_seq, ptr))
5121 return;
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133 if (before(ptr, tp->rcv_nxt))
5134 return;
5135
5136
5137 if (tp->urg_data && !after(ptr, tp->urg_seq))
5138 return;
5139
5140
5141 sk_send_sigurg(sk);
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
5159 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
5160 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
5161 tp->copied_seq++;
5162 if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
5163 __skb_unlink(skb, &sk->sk_receive_queue);
5164 __kfree_skb(skb);
5165 }
5166 }
5167
5168 tp->urg_data = TCP_URG_NOTYET;
5169 tp->urg_seq = ptr;
5170
5171
5172 tp->pred_flags = 0;
5173}
5174
5175
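/* Urgent data handling on the receive path: note a newly announced urgent
 * pointer when URG is set, then copy the single urgent byte out of the
 * segment that carries it and wake up the receiver.
 */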
5176static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
5177{
5178 struct tcp_sock *tp = tcp_sk(sk);
5179
5180
5181 if (th->urg)
5182 tcp_check_urg(sk, th);
5183
5184
5185 if (tp->urg_data == TCP_URG_NOTYET) {
5186 u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
5187 th->syn;
5188
5189
5190 if (ptr < skb->len) {
5191 u8 tmp;
5192 if (skb_copy_bits(skb, ptr, &tmp, 1))
5193 BUG();
5194 tp->urg_data = TCP_URG_VALID | tmp;
5195 if (!sock_flag(sk, SOCK_DEAD))
5196 sk->sk_data_ready(sk);
5197 }
5198 }
5199}
5200
5201
5202
5203
5204
5205
5206
5207
5208
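/* After the peer's FIN has been received (CLOSE_WAIT, LAST_ACK, CLOSING),
 * also treat a RST sequenced at rcv_nxt - 1 as matching; this appears to
 * accommodate peers that send the RST reusing the sequence number of
 * their FIN.
 */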
5209static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
5210{
5211 struct tcp_sock *tp = tcp_sk(sk);
5212
5213 return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
5214 (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
5215 TCPF_CLOSING));
5216}
5217
5218
5219
5220
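/* Shared validation for incoming segments: PAWS, receive-window sequence
 * check, RFC 5961 style RST sequence matching with a challenge ACK
 * fallback, and challenge ACKs for in-window SYNs. Returns true when the
 * segment should be processed further, false when it has been dropped.
 */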
5221static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5222 const struct tcphdr *th, int syn_inerr)
5223{
5224 struct tcp_sock *tp = tcp_sk(sk);
5225 bool rst_seq_match = false;
5226
5227
5228 if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
5229 tp->rx_opt.saw_tstamp &&
5230 tcp_paws_discard(sk, skb)) {
5231 if (!th->rst) {
5232 NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
5233 if (!tcp_oow_rate_limited(sock_net(sk), skb,
5234 LINUX_MIB_TCPACKSKIPPEDPAWS,
5235 &tp->last_oow_ack_time))
5236 tcp_send_dupack(sk, skb);
5237 goto discard;
5238 }
5239
5240 }
5241
5242
5243 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
5244
5245
5246
5247
5248
5249
5250 if (!th->rst) {
5251 if (th->syn)
5252 goto syn_challenge;
5253 if (!tcp_oow_rate_limited(sock_net(sk), skb,
5254 LINUX_MIB_TCPACKSKIPPEDSEQ,
5255 &tp->last_oow_ack_time))
5256 tcp_send_dupack(sk, skb);
5257 } else if (tcp_reset_check(sk, skb)) {
5258 tcp_reset(sk);
5259 }
5260 goto discard;
5261 }
5262
5263
5264 if (th->rst) {
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
5275 tcp_reset_check(sk, skb)) {
5276 rst_seq_match = true;
5277 } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
5278 struct tcp_sack_block *sp = &tp->selective_acks[0];
5279 int max_sack = sp[0].end_seq;
5280 int this_sack;
5281
5282 for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
5283 ++this_sack) {
5284 max_sack = after(sp[this_sack].end_seq,
5285 max_sack) ?
5286 sp[this_sack].end_seq : max_sack;
5287 }
5288
5289 if (TCP_SKB_CB(skb)->seq == max_sack)
5290 rst_seq_match = true;
5291 }
5292
5293 if (rst_seq_match)
5294 tcp_reset(sk);
5295 else {
5296
5297
5298
5299
5300 if (tp->syn_fastopen && !tp->data_segs_in &&
5301 sk->sk_state == TCP_ESTABLISHED)
5302 tcp_fastopen_active_disable(sk);
5303 tcp_send_challenge_ack(sk, skb);
5304 }
5305 goto discard;
5306 }
5307
5308
5309
5310
5311
5312
5313 if (th->syn) {
5314syn_challenge:
5315 if (syn_inerr)
5316 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
5317 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5318 tcp_send_challenge_ack(sk, skb);
5319 goto discard;
5320 }
5321
5322 return true;
5323
5324discard:
5325 tcp_drop(sk, skb);
5326 return false;
5327}
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
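/* TCP receive processing for the ESTABLISHED state, split into a
 * header-prediction fast path and a full slow path. The fast path handles
 * pure ACKs and in-sequence bulk data; anything unexpected (options other
 * than an aligned timestamp, out-of-window data, old timestamps, checksum
 * problems) falls back to the slow path below.
 */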
5352void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5353 const struct tcphdr *th)
5354{
5355 unsigned int len = skb->len;
5356 struct tcp_sock *tp = tcp_sk(sk);
5357
5358 tcp_mstamp_refresh(tp);
5359 if (unlikely(!sk->sk_rx_dst))
5360 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376 tp->rx_opt.saw_tstamp = 0;
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
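/* Header prediction: the fast path is taken only when the flag word
 * matches the precomputed pred_flags, the segment starts exactly at
 * rcv_nxt and its ACK does not claim data beyond snd_nxt.
 */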
5387 if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
5388 TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
5389 !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
5390 int tcp_header_len = tp->tcp_header_len;
5391
5392
5393
5394
5395
5396
5397
5398 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
5399
5400 if (!tcp_parse_aligned_timestamp(tp, th))
5401 goto slow_path;
5402
5403
5404 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
5405 goto slow_path;
5406
5407
5408
5409
5410
5411
5412 }
5413
5414 if (len <= tcp_header_len) {
5415
5416 if (len == tcp_header_len) {
5417
5418
5419
5420
5421 if (tcp_header_len ==
5422 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5423 tp->rcv_nxt == tp->rcv_wup)
5424 tcp_store_ts_recent(tp);
5425
5426
5427
5428
5429 tcp_ack(sk, skb, 0);
5430 __kfree_skb(skb);
5431 tcp_data_snd_check(sk);
5432 return;
5433 } else {
5434 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
5435 goto discard;
5436 }
5437 } else {
5438 int eaten = 0;
5439 bool fragstolen = false;
5440
5441 if (tcp_checksum_complete(skb))
5442 goto csum_error;
5443
5444 if ((int)skb->truesize > sk->sk_forward_alloc)
5445 goto step5;
5446
5447
5448
5449
5450
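/* A predicted segment is in window by definition (seq == rcv_nxt and
 * rcv_wup <= rcv_nxt), so ts_recent may be updated here.
 */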
5451 if (tcp_header_len ==
5452 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
5453 tp->rcv_nxt == tp->rcv_wup)
5454 tcp_store_ts_recent(tp);
5455
5456 tcp_rcv_rtt_measure_ts(sk, skb);
5457
5458 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
5459
5460
5461 eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
5462 &fragstolen);
5463
5464 tcp_event_data_recv(sk, skb);
5465
5466 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
5467
5468 tcp_ack(sk, skb, FLAG_DATA);
5469 tcp_data_snd_check(sk);
5470 if (!inet_csk_ack_scheduled(sk))
5471 goto no_ack;
5472 }
5473
5474 __tcp_ack_snd_check(sk, 0);
5475no_ack:
5476 if (eaten)
5477 kfree_skb_partial(skb, fragstolen);
5478 sk->sk_data_ready(sk);
5479 return;
5480 }
5481 }
5482
5483slow_path:
5484 if (len < (th->doff << 2) || tcp_checksum_complete(skb))
5485 goto csum_error;
5486
5487 if (!th->ack && !th->rst && !th->syn)
5488 goto discard;
5489
5490
5491
5492
5493
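/* Standard slow path: fully validate the segment (PAWS, sequence, RST,
 * SYN) before acting on it.
 */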
5494 if (!tcp_validate_incoming(sk, skb, th, 1))
5495 return;
5496
5497step5:
5498 if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
5499 goto discard;
5500
5501 tcp_rcv_rtt_measure_ts(sk, skb);
5502
5503
5504 tcp_urg(sk, skb, th);
5505
5506
5507 tcp_data_queue(sk, skb);
5508
5509 tcp_data_snd_check(sk);
5510 tcp_ack_snd_check(sk);
5511 return;
5512
5513csum_error:
5514 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
5515 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
5516
5517discard:
5518 tcp_drop(sk, skb);
5519}
5520EXPORT_SYMBOL(tcp_rcv_established);
5521
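/* Complete the transition to ESTABLISHED on an active open: record the
 * route, rebuild the header template, initialize metrics, congestion
 * control and buffer sizes, arm keepalive if requested, and enable the
 * fast path when the peer advertised no window scaling.
 */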
5522void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5523{
5524 struct tcp_sock *tp = tcp_sk(sk);
5525 struct inet_connection_sock *icsk = inet_csk(sk);
5526
5527 tcp_set_state(sk, TCP_ESTABLISHED);
5528 icsk->icsk_ack.lrcvtime = tcp_jiffies32;
5529
5530 if (skb) {
5531 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
5532 security_inet_conn_established(sk, skb);
5533 }
5534
5535
5536 icsk->icsk_af_ops->rebuild_header(sk);
5537
5538 tcp_init_metrics(sk);
5539 tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
5540 tcp_init_congestion_control(sk);
5541
5542
5543
5544
5545 tp->lsndtime = tcp_jiffies32;
5546
5547 tcp_init_buffer_space(sk);
5548
5549 if (sock_flag(sk, SOCK_KEEPOPEN))
5550 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5551
5552 if (!tp->rx_opt.snd_wscale)
5553 __tcp_fast_path_on(tp, tp->snd_wnd);
5554 else
5555 tp->pred_flags = 0;
5556}
5557
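/* Handle the SYN-ACK of an active Fast Open attempt: re-derive the peer's
 * MSS when our cached value was used, update the Fast Open cache (cookie,
 * MSS, whether data on the SYN was likely dropped), and retransmit any
 * unacknowledged SYN data. Returns true when Fast Open failed and the
 * caller must fall back.
 */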
5558static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5559 struct tcp_fastopen_cookie *cookie)
5560{
5561 struct tcp_sock *tp = tcp_sk(sk);
5562 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
5563 u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
5564 bool syn_drop = false;
5565
5566 if (mss == tp->rx_opt.user_mss) {
5567 struct tcp_options_received opt;
5568
5569
5570 tcp_clear_options(&opt);
5571 opt.user_mss = opt.mss_clamp = 0;
5572 tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
5573 mss = opt.mss_clamp;
5574 }
5575
5576 if (!tp->syn_fastopen) {
5577
5578 cookie->len = -1;
5579 } else if (tp->total_retrans) {
5580
5581
5582
5583
5584
5585 syn_drop = (cookie->len < 0 && data);
5586 } else if (cookie->len < 0 && !tp->syn_data) {
5587
5588
5589
5590
5591 try_exp = tp->syn_fastopen_exp ? 2 : 1;
5592 }
5593
5594 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
5595
5596 if (data) {
5597 tcp_for_write_queue_from(data, sk) {
5598 if (data == tcp_send_head(sk) ||
5599 __tcp_retransmit_skb(sk, data, 1))
5600 break;
5601 }
5602 tcp_rearm_rto(sk);
5603 NET_INC_STATS(sock_net(sk),
5604 LINUX_MIB_TCPFASTOPENACTIVEFAIL);
5605 return true;
5606 }
5607 tp->syn_data_acked = tp->syn_data;
5608 if (tp->syn_data_acked)
5609 NET_INC_STATS(sock_net(sk),
5610 LINUX_MIB_TCPFASTOPENACTIVE);
5611
5612 tcp_fastopen_add_skb(sk, synack);
5613
5614 return false;
5615}
5616
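/* Process a segment received in SYN_SENT. On a valid SYN-ACK the
 * connection moves to ESTABLISHED (with Fast Open fallback handling); on
 * a SYN without an ACK we move to SYN_RECV for a simultaneous open.
 * Return value: >0 asks the caller to send a reset, 0 means the skb was
 * consumed here, <0 lets the caller continue with urgent-data processing.
 */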
5617static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5618 const struct tcphdr *th)
5619{
5620 struct inet_connection_sock *icsk = inet_csk(sk);
5621 struct tcp_sock *tp = tcp_sk(sk);
5622 struct tcp_fastopen_cookie foc = { .len = -1 };
5623 int saved_clamp = tp->rx_opt.mss_clamp;
5624 bool fastopen_fail;
5625
5626 tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
5627 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
5628 tp->rx_opt.rcv_tsecr -= tp->tsoffset;
5629
5630 if (th->ack) {
5631
5632
5633
5634
5635
5636
5637
5638
5639 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
5640 after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
5641 goto reset_and_undo;
5642
5643 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
5644 !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
5645 tcp_time_stamp(tp))) {
5646 NET_INC_STATS(sock_net(sk),
5647 LINUX_MIB_PAWSACTIVEREJECTED);
5648 goto reset_and_undo;
5649 }
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659 if (th->rst) {
5660 tcp_reset(sk);
5661 goto discard;
5662 }
5663
5664
5665
5666
5667
5668
5669
5670
5671 if (!th->syn)
5672 goto discard_and_undo;
5673
5674
5675
5676
5677
5678
5679
5680
5681 tcp_ecn_rcv_synack(tp, th);
5682
5683 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5684 tcp_ack(sk, skb, FLAG_SLOWPATH);
5685
5686
5687
5688
5689 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
5690 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
5691
5692
5693
5694
5695 tp->snd_wnd = ntohs(th->window);
5696
5697 if (!tp->rx_opt.wscale_ok) {
5698 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
5699 tp->window_clamp = min(tp->window_clamp, 65535U);
5700 }
5701
5702 if (tp->rx_opt.saw_tstamp) {
5703 tp->rx_opt.tstamp_ok = 1;
5704 tp->tcp_header_len =
5705 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
5706 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
5707 tcp_store_ts_recent(tp);
5708 } else {
5709 tp->tcp_header_len = sizeof(struct tcphdr);
5710 }
5711
5712 if (tcp_is_sack(tp) && sysctl_tcp_fack)
5713 tcp_enable_fack(tp);
5714
5715 tcp_mtup_init(sk);
5716 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
5717 tcp_initialize_rcv_mss(sk);
5718
5719
5720
5721
5722 tp->copied_seq = tp->rcv_nxt;
5723
5724 smp_mb();
5725
5726 tcp_finish_connect(sk, skb);
5727
5728 fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
5729 tcp_rcv_fastopen_synack(sk, skb, &foc);
5730
5731 if (!sock_flag(sk, SOCK_DEAD)) {
5732 sk->sk_state_change(sk);
5733 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5734 }
5735 if (fastopen_fail)
5736 return -1;
5737 if (sk->sk_write_pending ||
5738 icsk->icsk_accept_queue.rskq_defer_accept ||
5739 icsk->icsk_ack.pingpong) {
5740
5741
5742
5743
5744
5745
5746
5747 inet_csk_schedule_ack(sk);
5748 tcp_enter_quickack_mode(sk);
5749 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5750 TCP_DELACK_MAX, TCP_RTO_MAX);
5751
5752discard:
5753 tcp_drop(sk, skb);
5754 return 0;
5755 } else {
5756 tcp_send_ack(sk);
5757 }
5758 return -1;
5759 }
5760
5761
5762
5763 if (th->rst) {
5764
5765
5766
5767
5768
5769
5770 goto discard_and_undo;
5771 }
5772
5773
5774 if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
5775 tcp_paws_reject(&tp->rx_opt, 0))
5776 goto discard_and_undo;
5777
5778 if (th->syn) {
5779
5780
5781
5782
5783 tcp_set_state(sk, TCP_SYN_RECV);
5784
5785 if (tp->rx_opt.saw_tstamp) {
5786 tp->rx_opt.tstamp_ok = 1;
5787 tcp_store_ts_recent(tp);
5788 tp->tcp_header_len =
5789 sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
5790 } else {
5791 tp->tcp_header_len = sizeof(struct tcphdr);
5792 }
5793
5794 tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
5795 tp->copied_seq = tp->rcv_nxt;
5796 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
5797
5798
5799
5800
5801 tp->snd_wnd = ntohs(th->window);
5802 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
5803 tp->max_window = tp->snd_wnd;
5804
5805 tcp_ecn_rcv_syn(tp, th);
5806
5807 tcp_mtup_init(sk);
5808 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
5809 tcp_initialize_rcv_mss(sk);
5810
5811 tcp_send_synack(sk);
5812#if 0
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824 return -1;
5825#else
5826 goto discard;
5827#endif
5828 }
5829
5830
5831
5832
5833discard_and_undo:
5834 tcp_clear_options(&tp->rx_opt);
5835 tp->rx_opt.mss_clamp = saved_clamp;
5836 goto discard;
5837
5838reset_and_undo:
5839 tcp_clear_options(&tp->rx_opt);
5840 tp->rx_opt.mss_clamp = saved_clamp;
5841 return 1;
5842}
5843
5844
5845
5846
5847
5848
5849
5850
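/* Implements the RFC 793 receive state machine for every state except
 * ESTABLISHED and TIME_WAIT; called from both the IPv4 and IPv6 receive
 * paths. Returns 0 when the segment has been handled (and possibly
 * queued) and 1 when the caller should send a reset.
 */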
5851int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
5852{
5853 struct tcp_sock *tp = tcp_sk(sk);
5854 struct inet_connection_sock *icsk = inet_csk(sk);
5855 const struct tcphdr *th = tcp_hdr(skb);
5856 struct request_sock *req;
5857 int queued = 0;
5858 bool acceptable;
5859
5860 switch (sk->sk_state) {
5861 case TCP_CLOSE:
5862 goto discard;
5863
5864 case TCP_LISTEN:
5865 if (th->ack)
5866 return 1;
5867
5868 if (th->rst)
5869 goto discard;
5870
5871 if (th->syn) {
5872 if (th->fin)
5873 goto discard;
5874
5875
5876
5877 local_bh_disable();
5878 acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
5879 local_bh_enable();
5880
5881 if (!acceptable)
5882 return 1;
5883 consume_skb(skb);
5884 return 0;
5885 }
5886 goto discard;
5887
5888 case TCP_SYN_SENT:
5889 tp->rx_opt.saw_tstamp = 0;
5890 tcp_mstamp_refresh(tp);
5891 queued = tcp_rcv_synsent_state_process(sk, skb, th);
5892 if (queued >= 0)
5893 return queued;
5894
5895
5896 tcp_urg(sk, skb, th);
5897 __kfree_skb(skb);
5898 tcp_data_snd_check(sk);
5899 return 0;
5900 }
5901
5902 tcp_mstamp_refresh(tp);
5903 tp->rx_opt.saw_tstamp = 0;
5904 req = tp->fastopen_rsk;
5905 if (req) {
5906 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
5907 sk->sk_state != TCP_FIN_WAIT1);
5908
5909 if (!tcp_check_req(sk, skb, req, true))
5910 goto discard;
5911 }
5912
5913 if (!th->ack && !th->rst && !th->syn)
5914 goto discard;
5915
5916 if (!tcp_validate_incoming(sk, skb, th, 0))
5917 return 0;
5918
5919
5920 acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
5921 FLAG_UPDATE_TS_RECENT |
5922 FLAG_NO_CHALLENGE_ACK) > 0;
5923
5924 if (!acceptable) {
5925 if (sk->sk_state == TCP_SYN_RECV)
5926 return 1;
5927 tcp_send_challenge_ack(sk, skb);
5928 goto discard;
5929 }
5930 switch (sk->sk_state) {
5931 case TCP_SYN_RECV:
5932 if (!tp->srtt_us)
5933 tcp_synack_rtt_meas(sk, req);
5934
5935
5936
5937
5938 if (req) {
5939 inet_csk(sk)->icsk_retransmits = 0;
5940 reqsk_fastopen_remove(sk, req, false);
5941 } else {
5942
5943 icsk->icsk_af_ops->rebuild_header(sk);
5944 tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
5945 tcp_init_congestion_control(sk);
5946
5947 tcp_mtup_init(sk);
5948 tp->copied_seq = tp->rcv_nxt;
5949 tcp_init_buffer_space(sk);
5950 }
5951 smp_mb();
5952 tcp_set_state(sk, TCP_ESTABLISHED);
5953 sk->sk_state_change(sk);
5954
5955
5956
5957
5958
5959 if (sk->sk_socket)
5960 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5961
5962 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
5963 tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
5964 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5965
5966 if (tp->rx_opt.tstamp_ok)
5967 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
5968
5969 if (req) {
5970
5971
5972
5973
5974
5975
5976
5977
5978 tcp_rearm_rto(sk);
5979 } else
5980 tcp_init_metrics(sk);
5981
5982 if (!inet_csk(sk)->icsk_ca_ops->cong_control)
5983 tcp_update_pacing_rate(sk);
5984
5985
5986 tp->lsndtime = tcp_jiffies32;
5987
5988 tcp_initialize_rcv_mss(sk);
5989 tcp_fast_path_on(tp);
5990 break;
5991
5992 case TCP_FIN_WAIT1: {
5993 int tmo;
5994
5995
5996
5997
5998
5999
6000 if (req) {
6001
6002 reqsk_fastopen_remove(sk, req, false);
6003 tcp_rearm_rto(sk);
6004 }
6005 if (tp->snd_una != tp->write_seq)
6006 break;
6007
6008 tcp_set_state(sk, TCP_FIN_WAIT2);
6009 sk->sk_shutdown |= SEND_SHUTDOWN;
6010
6011 sk_dst_confirm(sk);
6012
6013 if (!sock_flag(sk, SOCK_DEAD)) {
6014
6015 sk->sk_state_change(sk);
6016 break;
6017 }
6018
6019 if (tp->linger2 < 0) {
6020 tcp_done(sk);
6021 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6022 return 1;
6023 }
6024 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
6025 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
6026
6027 if (tp->syn_fastopen && th->fin)
6028 tcp_fastopen_active_disable(sk);
6029 tcp_done(sk);
6030 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6031 return 1;
6032 }
6033
6034 tmo = tcp_fin_time(sk);
6035 if (tmo > TCP_TIMEWAIT_LEN) {
6036 inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
6037 } else if (th->fin || sock_owned_by_user(sk)) {
6038
6039
6040
6041
6042
6043
6044 inet_csk_reset_keepalive_timer(sk, tmo);
6045 } else {
6046 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
6047 goto discard;
6048 }
6049 break;
6050 }
6051
6052 case TCP_CLOSING:
6053 if (tp->snd_una == tp->write_seq) {
6054 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
6055 goto discard;
6056 }
6057 break;
6058
6059 case TCP_LAST_ACK:
6060 if (tp->snd_una == tp->write_seq) {
6061 tcp_update_metrics(sk);
6062 tcp_done(sk);
6063 goto discard;
6064 }
6065 break;
6066 }
6067
6068
6069 tcp_urg(sk, skb, th);
6070
6071
6072 switch (sk->sk_state) {
6073 case TCP_CLOSE_WAIT:
6074 case TCP_CLOSING:
6075 case TCP_LAST_ACK:
6076 if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
6077 break;
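/* fall through */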
6078 case TCP_FIN_WAIT1:
6079 case TCP_FIN_WAIT2:
6080
6081
6082
6083
6084 if (sk->sk_shutdown & RCV_SHUTDOWN) {
6085 if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
6086 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
6087 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6088 tcp_reset(sk);
6089 return 1;
6090 }
6091 }
6092
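/* fall through */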
6093 case TCP_ESTABLISHED:
6094 tcp_data_queue(sk, skb);
6095 queued = 1;
6096 break;
6097 }
6098
6099
6100 if (sk->sk_state != TCP_CLOSE) {
6101 tcp_data_snd_check(sk);
6102 tcp_ack_snd_check(sk);
6103 }
6104
6105 if (!queued) {
6106discard:
6107 tcp_drop(sk, skb);
6108 }
6109 return 0;
6110}
6111EXPORT_SYMBOL(tcp_rcv_state_process);
6112
6113static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
6114{
6115 struct inet_request_sock *ireq = inet_rsk(req);
6116
6117 if (family == AF_INET)
6118 net_dbg_ratelimited("drop open request from %pI4/%u\n",
6119 &ireq->ir_rmt_addr, port);
6120#if IS_ENABLED(CONFIG_IPV6)
6121 else if (family == AF_INET6)
6122 net_dbg_ratelimited("drop open request from %pI6/%u\n",
6123 &ireq->ir_v6_rmt_addr, port);
6124#endif
6125}
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
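/* Decide whether ECN is negotiated for a passive connection: the SYN must
 * carry both ECE and CWR, and ECN is then enabled if the SYN was not
 * ECT-marked while ECN is allowed by sysctl or by the route, or if the
 * congestion control in use requires ECN.
 */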
6139static void tcp_ecn_create_request(struct request_sock *req,
6140 const struct sk_buff *skb,
6141 const struct sock *listen_sk,
6142 const struct dst_entry *dst)
6143{
6144 const struct tcphdr *th = tcp_hdr(skb);
6145 const struct net *net = sock_net(listen_sk);
6146 bool th_ecn = th->ece && th->cwr;
6147 bool ect, ecn_ok;
6148 u32 ecn_ok_dst;
6149
6150 if (!th_ecn)
6151 return;
6152
6153 ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
6154 ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
6155 ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
6156
6157 if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
6158 (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
6159 tcp_bpf_ca_needs_ecn((struct sock *)req))
6160 inet_rsk(req)->ecn_ok = 1;
6161}
6162
6163static void tcp_openreq_init(struct request_sock *req,
6164 const struct tcp_options_received *rx_opt,
6165 struct sk_buff *skb, const struct sock *sk)
6166{
6167 struct inet_request_sock *ireq = inet_rsk(req);
6168
6169 req->rsk_rcv_wnd = 0;
6170 req->cookie_ts = 0;
6171 tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
6172 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
6173 tcp_rsk(req)->snt_synack = tcp_clock_us();
6174 tcp_rsk(req)->last_oow_ack_time = 0;
6175 req->mss = rx_opt->mss_clamp;
6176 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
6177 ireq->tstamp_ok = rx_opt->tstamp_ok;
6178 ireq->sack_ok = rx_opt->sack_ok;
6179 ireq->snd_wscale = rx_opt->snd_wscale;
6180 ireq->wscale_ok = rx_opt->wscale_ok;
6181 ireq->acked = 0;
6182 ireq->ecn_ok = 0;
6183 ireq->ir_rmt_port = tcp_hdr(skb)->source;
6184 ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
6185 ireq->ir_mark = inet_request_mark(sk, skb);
6186}
6187
6188struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
6189 struct sock *sk_listener,
6190 bool attach_listener)
6191{
6192 struct request_sock *req = reqsk_alloc(ops, sk_listener,
6193 attach_listener);
6194
6195 if (req) {
6196 struct inet_request_sock *ireq = inet_rsk(req);
6197
6198 kmemcheck_annotate_bitfield(ireq, flags);
6199 ireq->ireq_opt = NULL;
6200#if IS_ENABLED(CONFIG_IPV6)
6201 ireq->pktopts = NULL;
6202#endif
6203 atomic64_set(&ireq->ir_cookie, 0);
6204 ireq->ireq_state = TCP_NEW_SYN_RECV;
6205 write_pnet(&ireq->ireq_net, sock_net(sk_listener));
6206 ireq->ireq_family = sk_listener->sk_family;
6207 }
6208
6209 return req;
6210}
6211EXPORT_SYMBOL(inet_reqsk_alloc);
6212
6213
6214
6215
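/* Decide whether to respond with a SYN cookie when the request queue is
 * under pressure, bump the matching SNMP counters, and warn once about
 * possible SYN flooding on the port.
 */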
6216static bool tcp_syn_flood_action(const struct sock *sk,
6217 const struct sk_buff *skb,
6218 const char *proto)
6219{
6220 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
6221 const char *msg = "Dropping request";
6222 bool want_cookie = false;
6223 struct net *net = sock_net(sk);
6224
6225#ifdef CONFIG_SYN_COOKIES
6226 if (net->ipv4.sysctl_tcp_syncookies) {
6227 msg = "Sending cookies";
6228 want_cookie = true;
6229 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
6230 } else
6231#endif
6232 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
6233
6234 if (!queue->synflood_warned &&
6235 net->ipv4.sysctl_tcp_syncookies != 2 &&
6236 xchg(&queue->synflood_warned, 1) == 0)
6237 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
6238 proto, ntohs(tcp_hdr(skb)->dest), msg);
6239
6240 return want_cookie;
6241}
6242
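/* If the listener requested TCP_SAVE_SYN, keep a copy of the network and
 * TCP headers of the SYN for later retrieval by the application.
 */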
6243static void tcp_reqsk_record_syn(const struct sock *sk,
6244 struct request_sock *req,
6245 const struct sk_buff *skb)
6246{
6247 if (tcp_sk(sk)->save_syn) {
6248 u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
6249 u32 *copy;
6250
6251 copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
6252 if (copy) {
6253 copy[0] = len;
6254 memcpy(&copy[1], skb_network_header(skb), len);
6255 req->saved_syn = copy;
6256 }
6257 }
6258}
6259
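/* Shared SYN processing for IPv4 and IPv6 listeners: apply SYN-cookie and
 * accept-queue overflow policy, allocate and initialize a request sock,
 * optionally create a Fast Open child, and send the SYN-ACK.
 */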
6260int tcp_conn_request(struct request_sock_ops *rsk_ops,
6261 const struct tcp_request_sock_ops *af_ops,
6262 struct sock *sk, struct sk_buff *skb)
6263{
6264 struct tcp_fastopen_cookie foc = { .len = -1 };
6265 __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
6266 struct tcp_options_received tmp_opt;
6267 struct tcp_sock *tp = tcp_sk(sk);
6268 struct net *net = sock_net(sk);
6269 struct sock *fastopen_sk = NULL;
6270 struct request_sock *req;
6271 bool want_cookie = false;
6272 struct dst_entry *dst;
6273 struct flowi fl;
6274
6275
6276
6277
6278
6279 if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
6280 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6281 want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
6282 if (!want_cookie)
6283 goto drop;
6284 }
6285
6286 if (sk_acceptq_is_full(sk)) {
6287 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6288 goto drop;
6289 }
6290
6291 req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
6292 if (!req)
6293 goto drop;
6294
6295 tcp_rsk(req)->af_specific = af_ops;
6296 tcp_rsk(req)->ts_off = 0;
6297
6298 tcp_clear_options(&tmp_opt);
6299 tmp_opt.mss_clamp = af_ops->mss_clamp;
6300 tmp_opt.user_mss = tp->rx_opt.user_mss;
6301 tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
6302 want_cookie ? NULL : &foc);
6303
6304 if (want_cookie && !tmp_opt.saw_tstamp)
6305 tcp_clear_options(&tmp_opt);
6306
6307 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
6308 tcp_openreq_init(req, &tmp_opt, skb, sk);
6309 inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
6310
6311
6312 inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
6313
6314 af_ops->init_req(req, sk, skb);
6315
6316 if (security_inet_conn_request(sk, skb, req))
6317 goto drop_and_free;
6318
6319 if (tmp_opt.tstamp_ok)
6320 tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
6321
6322 dst = af_ops->route_req(sk, &fl, req);
6323 if (!dst)
6324 goto drop_and_free;
6325
6326 if (!want_cookie && !isn) {
6327
6328 if (!net->ipv4.sysctl_tcp_syncookies &&
6329 (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6330 (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
6331 !tcp_peer_is_proven(req, dst)) {
6332
6333
6334
6335
6336
6337
6338
6339 pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
6340 rsk_ops->family);
6341 goto drop_and_release;
6342 }
6343
6344 isn = af_ops->init_seq(skb);
6345 }
6346
6347 tcp_ecn_create_request(req, skb, sk, dst);
6348
6349 if (want_cookie) {
6350 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
6351 req->cookie_ts = tmp_opt.tstamp_ok;
6352 if (!tmp_opt.tstamp_ok)
6353 inet_rsk(req)->ecn_ok = 0;
6354 }
6355
6356 tcp_rsk(req)->snt_isn = isn;
6357 tcp_rsk(req)->txhash = net_tx_rndhash();
6358 tcp_openreq_init_rwin(req, sk, dst);
6359 if (!want_cookie) {
6360 tcp_reqsk_record_syn(sk, req, skb);
6361 fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
6362 }
6363 if (fastopen_sk) {
6364 af_ops->send_synack(fastopen_sk, dst, &fl, req,
6365 &foc, TCP_SYNACK_FASTOPEN);
6366
6367 inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
6368 sk->sk_data_ready(sk);
6369 bh_unlock_sock(fastopen_sk);
6370 sock_put(fastopen_sk);
6371 } else {
6372 tcp_rsk(req)->tfo_listener = false;
6373 if (!want_cookie)
6374 inet_csk_reqsk_queue_hash_add(sk, req,
6375 tcp_timeout_init((struct sock *)req));
6376 af_ops->send_synack(sk, dst, &fl, req, &foc,
6377 !want_cookie ? TCP_SYNACK_NORMAL :
6378 TCP_SYNACK_COOKIE);
6379 if (want_cookie) {
6380 reqsk_free(req);
6381 return 0;
6382 }
6383 }
6384 reqsk_put(req);
6385 return 0;
6386
6387drop_and_release:
6388 dst_release(dst);
6389drop_and_free:
6390 reqsk_free(req);
6391drop:
6392 tcp_listendrop(sk);
6393 return 0;
6394}
6395EXPORT_SYMBOL(tcp_conn_request);
6396