// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the Transmission Control Protocol (TCP):
 * the output engine, which builds TCP segments and hands them to the
 * IP layer for transmission.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <net/tcp.h>
#include <net/mptcp.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>

#include <trace/events/tcp.h>

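/* Refresh clocks of a TCP socket, ensuring monotonically increasing values. */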
void tcp_mstamp_refresh(struct tcp_sock *tp)
{
	u64 val = tcp_clock_ns();

	tp->tcp_clock_cache = val;
	tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
}

static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp);

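/* Account for new data that has been sent to the network. */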
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);

	__skb_unlink(skb, &sk->sk_write_queue);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);

	if (tp->highest_sack == NULL)
		tp->highest_sack = skb;

	tp->packets_out += tcp_skb_pcount(skb);
	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
		tcp_rearm_rto(sk);

	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
		      tcp_skb_pcount(skb));
}

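/* Choose the sequence number for a segment that carries no new data:
 * SND.NXT if it is still within the receiver's window (or the window
 * shrank by less than one window-scale unit), otherwise the right edge
 * of the window.
 */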
static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
	    (tp->rx_opt.wscale_ok &&
	     ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
		return tp->snd_nxt;
	else
		return tcp_wnd_end(tp);
}

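/* Calculate the MSS to advertise in a SYN segment: start from tp->advmss
 * and lower it if the cached route advertises a smaller value.
 */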
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst) {
		unsigned int metric = dst_metric_advmss(dst);

		if (metric < mss) {
			mss = metric;
			tp->advmss = mss;
		}
	}

	return (__u16)mss;
}

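/* RFC2861: reset the congestion window after an idle period longer than
 * the RTO, halving cwnd once per RTO elapsed, down to the restart window.
 */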
void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
	u32 cwnd = tp->snd_cwnd;

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_jiffies32;
	tp->snd_cwnd_used = 0;
}

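/* Congestion state accounting after a packet has been sent. */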
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_jiffies32;

	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	/* If this is the first data packet sent in response to the
	 * previous received data, and it is a reply within ato of the
	 * last received packet, increase the pingpong count: the
	 * session looks interactive.
	 */
	if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
	    (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
		inet_csk_inc_pingpong_cnt(sk);

	tp->lsndtime = now;
}

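/* Account for an ACK we sent: clear any pending delayed-ACK timer and
 * adjust quick-ack state.
 */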
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
				      u32 rcv_nxt)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unlikely(tp->compressed_ack)) {
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
			      tp->compressed_ack);
		tp->compressed_ack = 0;
		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
			__sock_put(sk);
	}

	if (unlikely(rcv_nxt != tp->rcv_nxt))
		return;  /* Special ACK sent by DCTCP to reflect ECN */
	tcp_dec_quickack_mode(sk, pkts);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}

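/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space will be offered,
 * store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should be a multiple
 * of mss if possible. We assume here that mss >= 1; this MUST be enforced
 * by all callers.
 */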
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set, set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = rounddown(space, mss);

	/* NOTE: offering an initial window larger than 32767 will break some
	 * buggy TCP stacks that interpret the window field as a signed
	 * quantity.  Only offer such a window when the relevant sysctl does
	 * not ask for the workaround.
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = min_t(u32, space, U16_MAX);

	if (init_rcv_wnd)
		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);

	*rcv_wscale = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window */
		space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
		space = max_t(u32, space, sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
				      0, TCP_MAX_WSCALE);
	}

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
}
EXPORT_SYMBOL(tcp_select_initial_window);

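/* Chose a new window to advertise, update state in tcp_sock for the
 * socket, and return the result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing frame.
 */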
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 old_win = tp->rcv_wnd;
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Zone. The freshly computed window dropped below
		 * the right edge we have already committed to the peer;
		 * all we can do is keep the current right edge, rounded up
		 * to a whole window-scale unit.
		 */
		if (new_win == 0)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPWANTZEROWINDOWADV);
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale &&
	    sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rx_opt.rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0) {
		tp->pred_flags = 0;
		if (old_win)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPTOZEROWINDOWADV);
	} else if (old_win == 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
	}

	return new_win;
}

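/* Packet ECN state for a SYN-ACK */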
static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
	if (!(tp->ecn_flags & TCP_ECN_OK))
		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
	else if (tcp_ca_needs_ecn(sk) ||
		 tcp_bpf_ca_needs_ecn(sk))
		INET_ECN_xmit(sk);
}

/* Packet ECN state for a SYN. */
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;

	if (!use_ecn) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
			use_ecn = true;
	}

	tp->ecn_flags = 0;

	if (use_ecn) {
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
		tp->ecn_flags = TCP_ECN_OK;
		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
			INET_ECN_xmit(sk);
	}
}

static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
		/* tp->ecn_flags are cleared at a later point in time when
		 * the SYN-ACK is ultimately being received.
		 */
		TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
}

static void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
	if (inet_rsk(req)->ecn_ok)
		th->ece = 1;
}

/* Set up ECN state for a packet on an established connection: mark data
 * packets as ECT(0) and reflect pending CWR demands in the header.
 */
static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
			 struct tcphdr *th, int tcp_header_len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->ecn_flags & TCP_ECN_OK) {
		/* Not-retransmitted data segment: set ECT and inject CWR. */
		if (skb->len != tcp_header_len &&
		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
			INET_ECN_xmit(sk);
			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				th->cwr = 1;
				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
			}
		} else if (!tcp_ca_needs_ecn(sk)) {
			/* ACK or retransmitted segment: clear ECT state */
			INET_ECN_dontxmit(sk);
		}
		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
			th->ece = 1;
	}
}

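/* Constructs common control bits of a non-data skb.  If SYN/FIN is
 * present, auto increment the end seqno.
 */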
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	skb->ip_summed = CHECKSUM_PARTIAL;

	TCP_SKB_CB(skb)->tcp_flags = flags;
	TCP_SKB_CB(skb)->sacked = 0;

	tcp_skb_pcount_set(skb, 1);

	TCP_SKB_CB(skb)->seq = seq;
	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
		seq++;
	TCP_SKB_CB(skb)->end_seq = seq;
}

static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
	return tp->snd_una != tp->snd_up;
}

#define OPTION_SACK_ADVERTISE	(1 << 0)
#define OPTION_TS		(1 << 1)
#define OPTION_MD5		(1 << 2)
#define OPTION_WSCALE		(1 << 3)
#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
#define OPTION_SMC		(1 << 9)
#define OPTION_MPTCP		(1 << 10)

static void smc_options_write(__be32 *ptr, u16 *options)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (unlikely(OPTION_SMC & *options)) {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_EXP << 8) |
				       (TCPOLEN_EXP_SMC_BASE));
			*ptr++ = htonl(TCPOPT_SMC_MAGIC);
		}
	}
#endif
}

struct tcp_out_options {
	u16 options;		/* bit field of OPTION_* */
	u16 mss;		/* 0 to disable */
	u8 ws;			/* window scale, 0 to disable */
	u8 num_sack_blocks;	/* number of SACK blocks to include */
	u8 hash_size;		/* bytes in hash_location */
	u8 bpf_opt_len;		/* length of BPF hdr option */
	__u8 *hash_location;	/* temporary pointer, overloaded */
	__u32 tsval, tsecr;	/* need to include OPTION_TS */
	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
	struct mptcp_out_options mptcp;
};

static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp,
				struct tcp_out_options *opts)
{
#if IS_ENABLED(CONFIG_MPTCP)
	if (unlikely(OPTION_MPTCP & opts->options))
		mptcp_write_options(ptr, tp, &opts->mptcp);
#endif
}

#ifdef CONFIG_CGROUP_BPF
static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
					enum tcp_synack_type synack_type)
{
	if (unlikely(!skb))
		return BPF_WRITE_HDR_TCP_CURRENT_MSS;

	if (unlikely(synack_type == TCP_SYNACK_COOKIE))
		return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;

	return 0;
}

/* req, syn_skb and synack_type are used when writing synack */
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct sk_buff *syn_skb,
				  enum tcp_synack_type synack_type,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
	struct bpf_sock_ops_kern sock_ops;
	int err;

	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
					   BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
	    !*remaining)
		return;

	/* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */

	/* init sock_ops */
	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

	sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;

	if (req) {
		/* The listen "sk" cannot be passed here because
		 * it is not locked.  It would not make too much
		 * sense to do bpf_setsockopt(listen_sk) based
		 * on individual connection request also.
		 *
		 * Thus, "req" is passed here and the cgroup-bpf-progs
		 * of the listen "sk" will be run.
		 *
		 * "req" is also used here for fastopen even the "sk" here is
		 * a fullsock "child" sk.  It is to keep the behavior
		 * consistent between fastopen and non-fastopen on
		 * the bpf programming side.
		 */
		sock_ops.sk = (struct sock *)req;
		sock_ops.syn_skb = syn_skb;
	} else {
		sock_owned_by_me(sk);

		sock_ops.is_fullsock = 1;
		sock_ops.sk = sk;
	}

	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
	sock_ops.remaining_opt_len = *remaining;
	/* tcp_current_mss() does not pass a skb */
	if (skb)
		bpf_skops_init_skb(&sock_ops, skb, 0);

	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

	if (err || sock_ops.remaining_opt_len == *remaining)
		return;

	opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
	/* round up to 4 bytes */
	opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;

	*remaining -= opts->bpf_opt_len;
}

static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
				    struct request_sock *req,
				    struct sk_buff *syn_skb,
				    enum tcp_synack_type synack_type,
				    struct tcp_out_options *opts)
{
	u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
	struct bpf_sock_ops_kern sock_ops;
	int err;

	if (likely(!max_opt_len))
		return;

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

	sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;

	if (req) {
		sock_ops.sk = (struct sock *)req;
		sock_ops.syn_skb = syn_skb;
	} else {
		sock_owned_by_me(sk);

		sock_ops.is_fullsock = 1;
		sock_ops.sk = sk;
	}

	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
	sock_ops.remaining_opt_len = max_opt_len;
	first_opt_off = tcp_hdrlen(skb) - max_opt_len;
	bpf_skops_init_skb(&sock_ops, skb, first_opt_off);

	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

	if (err)
		nr_written = 0;
	else
		nr_written = max_opt_len - sock_ops.remaining_opt_len;

	if (nr_written < max_opt_len)
		memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
		       max_opt_len - nr_written);
}
#else
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct sk_buff *syn_skb,
				  enum tcp_synack_type synack_type,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
}

static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
				    struct request_sock *req,
				    struct sk_buff *syn_skb,
				    enum tcp_synack_type synack_type,
				    struct tcp_out_options *opts)
{
}
#endif

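/* Write previously computed TCP options to the packet.
 *
 * Beware: something in the Internet is very sensitive to the ordering of
 * TCP options; for interoperability, keep the ordering used here.  At
 * least SACK_PERM as the first option is known to cause problems.
 */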
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
			      struct tcp_out_options *opts)
{
	u16 options = opts->options;	/* mungable copy */

	if (unlikely(OPTION_MD5 & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
		/* overload cookie hash location */
		opts->hash_location = (__u8 *)ptr;
		ptr += 4;
	}

	if (unlikely(opts->mss)) {
		*ptr++ = htonl((TCPOPT_MSS << 24) |
			       (TCPOLEN_MSS << 16) |
			       opts->mss);
	}

	if (likely(OPTION_TS & options)) {
		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
				       (TCPOLEN_SACK_PERM << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
			options &= ~OPTION_SACK_ADVERTISE;
		} else {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		}
		*ptr++ = htonl(opts->tsval);
		*ptr++ = htonl(opts->tsecr);
	}

	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK_PERM << 8) |
			       TCPOLEN_SACK_PERM);
	}

	if (unlikely(OPTION_WSCALE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_WINDOW << 16) |
			       (TCPOLEN_WINDOW << 8) |
			       opts->ws);
	}

	if (unlikely(opts->num_sack_blocks)) {
		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
			tp->duplicate_sack : tp->selective_acks;
		int this_sack;

		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK << 8) |
			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
						     TCPOLEN_SACK_PERBLOCK)));

		for (this_sack = 0; this_sack < opts->num_sack_blocks;
		     ++this_sack) {
			*ptr++ = htonl(sp[this_sack].start_seq);
			*ptr++ = htonl(sp[this_sack].end_seq);
		}

		tp->rx_opt.dsack = 0;
	}

	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
		u8 *p = (u8 *)ptr;
		u32 len; /* Fast Open option length */

		if (foc->exp) {
			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
				     TCPOPT_FASTOPEN_MAGIC);
			p += TCPOLEN_EXP_FASTOPEN_BASE;
		} else {
			len = TCPOLEN_FASTOPEN_BASE + foc->len;
			*p++ = TCPOPT_FASTOPEN;
			*p++ = len;
		}

		memcpy(p, foc->val, foc->len);
		if ((len & 3) == 2) {
			p[foc->len] = TCPOPT_NOP;
			p[foc->len + 1] = TCPOPT_NOP;
		}
		ptr += (len + 3) >> 2;
	}

	smc_options_write(ptr, &options);

	mptcp_options_write(ptr, tp, opts);
}

static void smc_set_option(const struct tcp_sock *tp,
			   struct tcp_out_options *opts,
			   unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (tp->syn_smc) {
			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				opts->options |= OPTION_SMC;
				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
			}
		}
	}
#endif
}

static void smc_set_option_cond(const struct tcp_sock *tp,
				const struct inet_request_sock *ireq,
				struct tcp_out_options *opts,
				unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (tp->syn_smc && ireq->smc_ok) {
			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				opts->options |= OPTION_SMC;
				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
			}
		}
	}
#endif
}

static void mptcp_set_option_cond(const struct request_sock *req,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
	if (rsk_is_mptcp(req)) {
		unsigned int size;

		if (mptcp_synack_options(req, &size, &opts->mptcp)) {
			if (*remaining >= size) {
				opts->options |= OPTION_MPTCP;
				*remaining -= size;
			}
		}
	}
}

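/* Compute TCP options for SYN packets.  This is not the final network
 * wire format yet.
 */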
static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
				    struct tcp_out_options *opts,
				    struct tcp_md5sig_key **md5)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;
	struct tcp_fastopen_request *fastopen = tp->fastopen_req;

	*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
	if (static_branch_unlikely(&tcp_md5_needed) &&
	    rcu_access_pointer(tp->md5sig_info)) {
		*md5 = tp->af_specific->md5_lookup(sk, sk);
		if (*md5) {
			opts->options |= OPTION_MD5;
			remaining -= TCPOLEN_MD5SIG_ALIGNED;
		}
	}
#endif

	/* We always get an MSS option.  The option bytes which will be seen in
	 * normal data packets should timestamps be used, must be in the MSS
	 * advertised.  But we subtract them from tp->mss_cache so that
	 * calculations in tcp_sendmsg are simpler etc.  So account for this
	 * fact here if necessary.  If we don't do this correctly, as a
	 * receiver we won't recognize data packets as being full sized when
	 * we should, and thus we won't abide by the delayed ACK rules
	 * correctly.
	 */
	opts->mss = tcp_advertise_mss(sk);
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
		opts->tsecr = tp->rx_opt.ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
		opts->ws = tp->rx_opt.rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!(OPTION_TS & opts->options)))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}

	if (fastopen && fastopen->cookie.len >= 0) {
		u32 need = fastopen->cookie.len;

		need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
					       TCPOLEN_FASTOPEN_BASE;
		need = (need + 3) & ~3U;  /* align to 32 bits */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = &fastopen->cookie;
			remaining -= need;
			tp->syn_fastopen = 1;
			tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
		}
	}

	smc_set_option(tp, opts, &remaining);

	if (sk_is_mptcp(sk)) {
		unsigned int size;

		if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
			opts->options |= OPTION_MPTCP;
			remaining -= size;
		}
	}

	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);

	return MAX_TCP_OPTION_SPACE - remaining;
}

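/* Set up TCP options for SYN-ACKs. */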
static unsigned int tcp_synack_options(const struct sock *sk,
				       struct request_sock *req,
				       unsigned int mss, struct sk_buff *skb,
				       struct tcp_out_options *opts,
				       const struct tcp_md5sig_key *md5,
				       struct tcp_fastopen_cookie *foc,
				       enum tcp_synack_type synack_type,
				       struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;

#ifdef CONFIG_TCP_MD5SIG
	if (md5) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;

		/* We can't fit any SACK blocks in a packet with MD5 + TS
		 * options.  There was discussion about disabling SACK
		 * rather than TS in order to fit in better with old,
		 * buggy kernels, but that was deemed to be unnecessary.
		 */
		if (synack_type != TCP_SYNACK_COOKIE)
			ireq->tstamp_ok &= !ireq->sack_ok;
	}
#endif

	/* We always send an MSS option. */
	opts->mss = mss;
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(ireq->wscale_ok)) {
		opts->ws = ireq->rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(ireq->tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
		opts->tsecr = req->ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(ireq->sack_ok)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!ireq->tstamp_ok))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}
	if (foc != NULL && foc->len >= 0) {
		u32 need = foc->len;

		need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				   TCPOLEN_FASTOPEN_BASE;
		need = (need + 3) & ~3U;  /* align to 32 bits */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = foc;
			remaining -= need;
		}
	}

	mptcp_set_option_cond(req, opts, &remaining);

	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);

	bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
			      synack_type, opts, &remaining);

	return MAX_TCP_OPTION_SPACE - remaining;
}

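/* Compute TCP options for ESTABLISHED sockets.  This is not the
 * final wire format yet.
 */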
static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
					    struct tcp_out_options *opts,
					    struct tcp_md5sig_key **md5)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int size = 0;
	unsigned int eff_sacks;

	opts->options = 0;

	*md5 = NULL;
#ifdef CONFIG_TCP_MD5SIG
	if (static_branch_unlikely(&tcp_md5_needed) &&
	    rcu_access_pointer(tp->md5sig_info)) {
		*md5 = tp->af_specific->md5_lookup(sk, sk);
		if (*md5) {
			opts->options |= OPTION_MD5;
			size += TCPOLEN_MD5SIG_ALIGNED;
		}
	}
#endif

	if (likely(tp->rx_opt.tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
		opts->tsecr = tp->rx_opt.ts_recent;
		size += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* MPTCP options have precedence over SACK for the limited TCP
	 * option space because a MPTCP connection would be forced to
	 * fall back to regular TCP if a required multipath option is
	 * missing.  SACK still gets a chance to use whatever space is
	 * left.
	 */
	if (sk_is_mptcp(sk)) {
		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
		unsigned int opt_size = 0;

		if (mptcp_established_options(sk, skb, &opt_size, remaining,
					      &opts->mptcp)) {
			opts->options |= OPTION_MPTCP;
			size += opt_size;
		}
	}

	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
	if (unlikely(eff_sacks)) {
		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;

		if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
					 TCPOLEN_SACK_PERBLOCK))
			return size;

		opts->num_sack_blocks =
			min_t(unsigned int, eff_sacks,
			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
			      TCPOLEN_SACK_PERBLOCK);

		size += TCPOLEN_SACK_BASE_ALIGNED +
			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	}

	if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
					    BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;

		bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);

		size = MAX_TCP_OPTION_SPACE - remaining;
	}

	return size;
}

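/* TCP SMALL QUEUES (TSQ)
 *
 * TSQ's goal is to keep a small amount of skbs per tcp flow in tx queues
 * (qdisc + device queues), to reduce RTT and bufferbloat.
 * This is done using a special skb destructor (tcp_wfree).
 *
 * Since transmitting from an skb destructor is forbidden, we use a
 * tasklet to process all sockets that eventually need to send more skbs:
 * one tasklet per cpu, each with its own queue of sockets.
 */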
struct tsq_tasklet {
	struct tasklet_struct	tasklet;
	struct list_head	head; /* queue of tcp sockets */
};
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);

static void tcp_tsq_write(struct sock *sk)
{
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) {
		struct tcp_sock *tp = tcp_sk(sk);

		if (tp->lost_out > tp->retrans_out &&
		    tp->snd_cwnd > tcp_packets_in_flight(tp)) {
			tcp_mstamp_refresh(tp);
			tcp_xmit_retransmit_queue(sk);
		}

		tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
			       0, GFP_ATOMIC);
	}
}

static void tcp_tsq_handler(struct sock *sk)
{
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk))
		tcp_tsq_write(sk);
	else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
		sock_hold(sk);
	bh_unlock_sock(sk);
}

/* One tasklet per cpu tries to send more skbs.
 * We run in tasklet context but need to disable irqs when
 * transferring tsq->head, because tcp_wfree() might
 * interrupt us (non NAPI drivers).
 */
static void tcp_tasklet_func(struct tasklet_struct *t)
{
	struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet);
	LIST_HEAD(list);
	unsigned long flags;
	struct list_head *q, *n;
	struct tcp_sock *tp;
	struct sock *sk;

	local_irq_save(flags);
	list_splice_init(&tsq->head, &list);
	local_irq_restore(flags);

	list_for_each_safe(q, n, &list) {
		tp = list_entry(q, struct tcp_sock, tsq_node);
		list_del(&tp->tsq_node);

		sk = (struct sock *)tp;
		smp_mb__before_atomic();
		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);

		tcp_tsq_handler(sk);
		sk_free(sk);
	}
}

#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
			  TCPF_WRITE_TIMER_DEFERRED |	\
			  TCPF_DELACK_TIMER_DEFERRED |	\
			  TCPF_MTU_REDUCED_DEFERRED)
/**
 * tcp_release_cb - tcp release_sock() callback
 * @sk: socket
 *
 * called from release_sock() to perform protocol dependent
 * actions before socket release.
 */
void tcp_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	/* perform an atomic operation only if at least one flag is set */
	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & TCP_DEFERRED_ALL))
			return;
		nflags = flags & ~TCP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	if (flags & TCPF_TSQ_DEFERRED) {
		tcp_tsq_write(sk);
		__sock_put(sk);
	}
	/* Here begins the tricky part :
	 * We are called from release_sock() with :
	 * 1) BH disabled
	 * 2) sk_lock.slock spinlock held
	 * 3) socket owned by us (sk->sk_lock.owned == 1)
	 *
	 * But following code is meant to be called from BH handlers,
	 * so we should keep BH disabled, but early release socket ownership
	 */
	sock_release_ownership(sk);

	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		tcp_write_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
		tcp_delack_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
		__sock_put(sk);
	}
}
EXPORT_SYMBOL(tcp_release_cb);

void __init tcp_tasklet_init(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);

		INIT_LIST_HEAD(&tsq->head);
		tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
	}
}

/* Write buffer destructor automatically called from kfree_skb.
 * We can't xmit new skbs from this context, as we might already
 * hold qdisc lock.
 */
void tcp_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long flags, nval, oval;

	/* Keep one reference on sk_wmem_alloc.
	 * Will be released by sk_free() from here or tcp_tasklet_func()
	 */
	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));

	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
	 * Wait until our queues (qdisc + devices) are drained.
	 * This gives :
	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
	 * - chance for incoming ACK (processed by another cpu maybe)
	 *   to migrate this flow (skb->ooo_okay will be eventually set)
	 */
	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
		goto out;

	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
		struct tsq_tasklet *tsq;
		bool empty;

		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
			goto out;

		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
		if (nval != oval)
			continue;

		/* queue this socket to tasklet queue */
		local_irq_save(flags);
		tsq = this_cpu_ptr(&tsq_tasklet);
		empty = list_empty(&tsq->head);
		list_add(&tp->tsq_node, &tsq->head);
		if (empty)
			tasklet_schedule(&tsq->tasklet);
		local_irq_restore(flags);
		return;
	}
out:
	sk_free(sk);
}

/* Note: Called under soft irq.
 * We can call TCP stack right away, unless socket is owned by user.
 */
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{
	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
	struct sock *sk = (struct sock *)tp;

	tcp_tsq_handler(sk);
	sock_put(sk);

	return HRTIMER_NORESTART;
}

static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
				      u64 prior_wstamp)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sk->sk_pacing_status != SK_PACING_NONE) {
		unsigned long rate = sk->sk_pacing_rate;

		/* Original sch_fq does not pace first 10 MSS.
		 * Note that tp->data_segs_out overflows after 2^32 packets,
		 * this is a minor annoyance.
		 */
		if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
			u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
			u64 credit = tp->tcp_wstamp_ns - prior_wstamp;

			/* take into account OS jitter */
			len_ns -= min_t(u64, len_ns / 2, credit);
			tp->tcp_wstamp_ns += len_ns;
		}
	}
	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}

INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));

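/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */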
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	struct tcp_out_options opts;
	unsigned int tcp_options_size, tcp_header_size;
	struct sk_buff *oskb = NULL;
	struct tcp_md5sig_key *md5;
	struct tcphdr *th;
	u64 prior_wstamp;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));
	tp = tcp_sk(sk);
	prior_wstamp = tp->tcp_wstamp_ns;
	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
	if (clone_it) {
		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
			- tp->snd_una;
		oskb = skb;

		tcp_skb_tsorted_save(oskb) {
			if (unlikely(skb_cloned(oskb)))
				skb = pskb_copy(oskb, gfp_mask);
			else
				skb = skb_clone(oskb, gfp_mask);
		} tcp_skb_tsorted_restore(oskb);

		if (unlikely(!skb))
			return -ENOBUFS;
		/* retransmit skbs might have a non zero value in skb->dev
		 * because skb->dev is aliased with skb->rbnode.rb_left
		 */
		skb->dev = NULL;
	}

	inet = inet_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	} else {
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
		/* Force a PSH flag on all (GSO) packets to expedite GRO flush
		 * at receiver : This slightly improves GRO performance.
		 * Note that we do not force the PSH flag for non GSO packets,
		 * because they might be sent under high congestion events,
		 * and in this case it is better to delay the delivery of 1-MSS
		 * packets and thus the corresponding ACK packet that would
		 * release the following packet.
		 */
		if (tcp_skb_pcount(skb) > 1)
			tcb->tcp_flags |= TCPHDR_PSH;
	}
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	/* if no packet is in qdisc/device queue, then allow XPS to select
	 * another queue. We can be called from tcp_tsq_handler()
	 * which holds one reference to sk.
	 *
	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
	 * One way to get this would be to set skb->truesize = 2 on them.
	 */
	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);

	/* If we had to use memory reserve to allocate this skb,
	 * this might cause drops if packet is looped back :
	 * Other socket might not have SOCK_MEMALLOC.
	 * Packets not looped back do not care about pfmemalloc.
	 */
	skb->pfmemalloc = 0;

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);

	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);

	/* Build TCP header and checksum it. */
	th = (struct tcphdr *)skb->data;
	th->source		= inet->inet_sport;
	th->dest		= inet->inet_dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(rcv_nxt);
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->tcp_flags);

	th->check		= 0;
	th->urg_ptr		= 0;

	/* The urg_mode check is necessary during a below snd_una win probe */
	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
		if (before(tp->snd_up, tcb->seq + 0x10000)) {
			th->urg_ptr = htons(tp->snd_up - tcb->seq);
			th->urg = 1;
		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
			th->urg_ptr = htons(0xFFFF);
			th->urg = 1;
		}
	}

	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
		th->window      = htons(tcp_select_window(sk));
		tcp_ecn_send(sk, skb, th, tcp_header_size);
	} else {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	}

	tcp_options_write((__be32 *)(th + 1), tp, &opts);

#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		tp->af_specific->calc_md5_hash(opts.hash_location,
					       md5, sk, skb);
	}
#endif

	/* BPF prog is the last one writing header option */
	bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);

	INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
			   tcp_v6_send_check, tcp_v4_send_check,
			   sk, skb);

	if (likely(tcb->tcp_flags & TCPHDR_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);

	if (skb->len != tcp_header_size) {
		tcp_event_data_sent(tp, sk);
		tp->data_segs_out += tcp_skb_pcount(skb);
		tp->bytes_sent += skb->len - tcp_header_size;
	}

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
			      tcp_skb_pcount(skb));

	tp->segs_out += tcp_skb_pcount(skb);
	skb_set_hash_from_sk(skb, sk);
	/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */

	/* Cleanup our debris for IP stacks */
	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
			       sizeof(struct inet6_skb_parm)));

	tcp_add_tx_delay(skb, tp);

	err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
				 inet6_csk_xmit, ip_queue_xmit,
				 sk, skb, &inet->cork.fl);

	if (unlikely(err > 0)) {
		tcp_enter_cwr(sk);
		err = net_xmit_eval(err);
	}
	if (!err && oskb) {
		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
		tcp_rate_skb_sent(sk, oskb);
	}
	return err;
}

static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
				  tcp_sk(sk)->rcv_nxt);
}

/* This routine just queues the buffer for sending.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Advance write_seq and place onto the write_queue. */
	WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
	__skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk_wmem_queued_add(sk, skb->truesize);
	sk_mem_charge(sk, skb->truesize);
}

/* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
	if (skb->len <= mss_now) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		tcp_skb_pcount_set(skb, 1);
		TCP_SKB_CB(skb)->tcp_gso_size = 0;
	} else {
		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
		TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
	}
}

/* Pcount in the middle of the write queue got changed, we need to do various
 * tweaks to fix counters
 */
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->packets_out -= decr;

	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
		tp->sacked_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
		tp->retrans_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
		tp->lost_out -= decr;

	/* Reno case is special. Sigh... */
	if (tcp_is_reno(tp) && decr > 0)
		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);

	if (tp->lost_skb_hint &&
	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
		tp->lost_cnt_hint -= decr;

	tcp_verify_left_out(tp);
}

static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->txstamp_ack ||
		(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
}

static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	if (unlikely(tcp_has_tx_tstamp(skb)) &&
	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;

		shinfo->tx_flags &= ~tsflags;
		shinfo2->tx_flags |= tsflags;
		swap(shinfo->tskey, shinfo2->tskey);
		TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
		TCP_SKB_CB(skb)->txstamp_ack = 0;
	}
}

static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
{
	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
	TCP_SKB_CB(skb)->eor = 0;
}

/* Insert buff after skb on the write or rtx queue of sk.  */
static void tcp_insert_write_queue_after(struct sk_buff *skb,
					 struct sk_buff *buff,
					 struct sock *sk,
					 enum tcp_queue tcp_queue)
{
	if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
		__skb_queue_after(&sk->sk_write_queue, skb, buff);
	else
		tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
}

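/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */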
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
		 struct sk_buff *skb, u32 len,
		 unsigned int mss_now, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int nsize, old_factor;
	long limit;
	int nlen;
	u8 flags;

	if (WARN_ON(len > skb->len))
		return -EINVAL;

	nsize = skb_headlen(skb) - len;
	if (nsize < 0)
		nsize = 0;

	/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
	 * We need some allowance to not penalize applications setting small
	 * SO_SNDBUF values.
	 * Also allow first and last skb in retransmit queue to be split.
	 */
	limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
	if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
		     tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
		     skb != tcp_rtx_queue_head(sk) &&
		     skb != tcp_rtx_queue_tail(sk))) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
		return -ENOMEM;
	}

	if (skb_unclone(skb, gfp))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
	if (!buff)
		return -ENOMEM; /* We'll just try again later. */
	skb_copy_decrypted(buff, skb);
	mptcp_skb_ext_copy(buff, skb);

	sk_wmem_queued_add(sk, buff->truesize);
	sk_mem_charge(sk, buff->truesize);
	nlen = skb->len - len - nsize;
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
	TCP_SKB_CB(buff)->tcp_flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
	tcp_skb_fragment_eor(skb, buff);

	skb_split(skb, buff, len);

	buff->ip_summed = CHECKSUM_PARTIAL;

	buff->tstamp = skb->tstamp;
	tcp_fragment_tstamp(skb, buff);

	old_factor = tcp_skb_pcount(skb);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(skb, mss_now);
	tcp_set_skb_tso_segs(buff, mss_now);

	/* Update delivered info for the new segment */
	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;

	/* If this packet has been sent out already, we must
	 * adjust the various packet counters.
	 */
	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
		int diff = old_factor - tcp_skb_pcount(skb) -
			tcp_skb_pcount(buff);

		if (diff)
			tcp_adjust_pcount(sk, skb, diff);
	}

	/* Link BUFF into the send queue. */
	__skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
		list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);

	return 0;
}

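/* This is similar to __pskb_pull_tail().  The difference is that pulled
 * data is not copied, but immediately discarded.
 */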
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
	struct skb_shared_info *shinfo;
	int i, k, eat;

	eat = min_t(int, len, skb_headlen(skb));
	if (eat) {
		__skb_pull(skb, eat);
		len -= eat;
		if (!len)
			return 0;
	}
	eat = len;
	k = 0;
	shinfo = skb_shinfo(skb);
	for (i = 0; i < shinfo->nr_frags; i++) {
		int size = skb_frag_size(&shinfo->frags[i]);

		if (size <= eat) {
			skb_frag_unref(skb, i);
			eat -= size;
		} else {
			shinfo->frags[k] = shinfo->frags[i];
			if (eat) {
				skb_frag_off_add(&shinfo->frags[k], eat);
				skb_frag_size_sub(&shinfo->frags[k], eat);
				eat = 0;
			}
			k++;
		}
	}
	shinfo->nr_frags = k;

	skb->data_len -= len;
	skb->len = skb->data_len;
	return len;
}

/* Remove acked data from a packet in the transmit queue. */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	u32 delta_truesize;

	if (skb_unclone(skb, GFP_ATOMIC))
		return -ENOMEM;

	delta_truesize = __pskb_trim_head(skb, len);

	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (delta_truesize) {
		skb->truesize	   -= delta_truesize;
		sk_wmem_queued_add(sk, -delta_truesize);
		sk_mem_uncharge(sk, delta_truesize);
	}

	/* Any change of skb->len requires recalculation of tso factor. */
	if (tcp_skb_pcount(skb) > 1)
		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));

	return 0;
}

/* Calculate MSS not accounting any TCP options.  */
static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* Calculate base mss without TCP options:
	 * it is MMS_S - sizeof(tcphdr) of RFC1122.
	 */
	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
	if (icsk->icsk_af_ops->net_frag_header_len) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_allfrag(dst))
			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
	}

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->rx_opt.mss_clamp)
		mss_now = tp->rx_opt.mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= icsk->icsk_ext_hdr_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
	return mss_now;
}

/* Calculate MSS. Not accounting for SACKs here.  */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	/* Subtract TCP options size, not including SACKs */
	return __tcp_mtu_to_mss(sk, pmtu) -
	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
}

/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int mtu;

	mtu = mss +
	      tp->tcp_header_len +
	      icsk->icsk_ext_hdr_len +
	      icsk->icsk_af_ops->net_header_len;

	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
	if (icsk->icsk_af_ops->net_frag_header_len) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_allfrag(dst))
			mtu += icsk->icsk_af_ops->net_frag_header_len;
	}
	return mtu;
}
EXPORT_SYMBOL(tcp_mss_to_mtu);

/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct net *net = sock_net(sk);

	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
				      icsk->icsk_af_ops->net_header_len;
	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
	icsk->icsk_mtup.probe_size = 0;
	if (icsk->icsk_mtup.enabled)
		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
EXPORT_SYMBOL(tcp_mtup_init);

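/* This function synchronizes snd mss to current pmtu/exthdr set.
 *
 * tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count
 * for TCP options, but includes only bare TCP header.
 *
 * tp->rx_opt.mss_clamp is mss negotiated at connection setup.
 * It is minimum of user_mss and mss received with SYN.
 * It also does not include TCP options.
 *
 * inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
 *
 * tp->mss_cache is current effective sending mss, including
 * all tcp options except for SACKs. It is evaluated,
 * taking into account current pmtu, but never exceeds
 * tp->rx_opt.mss_clamp.
 *
 * NOTE1. rfc1122 clearly states that advertised MSS
 * DOES NOT include either tcp or ip options.
 *
 * NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
 * are READ ONLY outside this function.
 */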
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	if (icsk->icsk_mtup.search_high > pmtu)
		icsk->icsk_mtup.search_high = pmtu;

	mss_now = tcp_mtu_to_mss(sk, pmtu);
	mss_now = tcp_bound_to_half_wnd(tp, mss_now);

	/* And store cached results */
	icsk->icsk_pmtu_cookie = pmtu;
	if (icsk->icsk_mtup.enabled)
		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
	tp->mss_cache = mss_now;

	return mss_now;
}
EXPORT_SYMBOL(tcp_sync_mss);

/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 */
unsigned int tcp_current_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	u32 mss_now;
	unsigned int header_len;
	struct tcp_out_options opts;
	struct tcp_md5sig_key *md5;

	mss_now = tp->mss_cache;

	if (dst) {
		u32 mtu = dst_mtu(dst);

		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
			mss_now = tcp_sync_mss(sk, mtu);
	}

	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
		     sizeof(struct tcphdr);
	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
	 * some common options. If this is an odd packet (because we have SACK
	 * blocks etc) then our calculated header_len will be different, and
	 * we have to adjust mss_now correspondingly.
	 */
	if (header_len != tp->tcp_header_len) {
		int delta = (int) header_len - tp->tcp_header_len;

		mss_now -= delta;
	}

	return mss_now;
}

/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
 * As additional protections, we do not touch cwnd in retransmission phases,
 * and if application hit its sndbuf limit recently.
 */
static void tcp_cwnd_application_limited(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		/* Limited by application or receiver window. */
		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
		u32 win_used = max(tp->snd_cwnd_used, init_win);

		if (win_used < tp->snd_cwnd) {
			tp->snd_ssthresh = tcp_current_ssthresh(sk);
			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
		}
		tp->snd_cwnd_used = 0;
	}
	tp->snd_cwnd_stamp = tcp_jiffies32;
}

static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	struct tcp_sock *tp = tcp_sk(sk);

	/* Track the maximum number of outstanding packets in each
	 * window, and remember whether we were cwnd-limited then.
	 */
	if (!before(tp->snd_una, tp->max_packets_seq) ||
	    tp->packets_out > tp->max_packets_out ||
	    is_cwnd_limited) {
		tp->max_packets_out = tp->packets_out;
		tp->max_packets_seq = tp->snd_nxt;
		tp->is_cwnd_limited = is_cwnd_limited;
	}

	if (tcp_is_cwnd_limited(sk)) {
		/* Network is fed fully. */
		tp->snd_cwnd_used = 0;
		tp->snd_cwnd_stamp = tcp_jiffies32;
	} else {
		/* Network starves. */
		if (tp->packets_out > tp->snd_cwnd_used)
			tp->snd_cwnd_used = tp->packets_out;

		if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
		    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
		    !ca_ops->cong_control)
			tcp_cwnd_application_limited(sk);

		/* The following conditions together indicate the starvation
		 * is caused by insufficient sender buffer:
		 * 1) just sent some data (see tcp_write_xmit)
		 * 2) not cwnd limited (this else condition)
		 * 3) no more data to send (tcp_write_queue_empty())
		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
		 */
		if (tcp_write_queue_empty(sk) && sk->sk_socket &&
		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
}

/* Minshall's variant of the Nagle send check. */
static bool tcp_minshall_check(const struct tcp_sock *tp)
{
	return after(tp->snd_sml, tp->snd_una) &&
		!after(tp->snd_sml, tp->snd_nxt);
}

/* Update snd_sml if this skb is under mss.
 * Note that a TSO packet might end with a sub-mss segment.
 * The test is really :
 * if ((skb->len % mss) != 0)
 *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
 * But we can avoid doing the divide again given we already have
 *  skb_pcount = skb->len / mss_now
 */
static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
				const struct sk_buff *skb)
{
	if (skb->len < tcp_skb_pcount(skb) * mss_now)
		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}

/* Return false, if packet can be sent now without violating Nagle's rules:
 * 1. It is full sized. (provided by caller in %partial bool)
 * 2. Or it contains FIN. (already checked by caller)
 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
 *    With Minshall's modification: all sent small packets are ACKed.
 */
static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
			    int nonagle)
{
	return partial &&
		((nonagle & TCP_NAGLE_CORK) ||
		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
}

/* Return how many segs we'd like on a TSO packet,
 * depending on current pacing rate, and how close the peer is.
 */
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
			    int min_tso_segs)
{
	u32 bytes, segs;

	bytes = min_t(unsigned long,
		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
		      sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);

	/* Goal is to send at least one packet per ms,
	 * not one big TSO packet every 100 ms.
	 * This preserves ACK clocking and is consistent
	 * with tcp_tso_should_defer() heuristic.
	 */
	segs = max_t(u32, bytes / mss_now, min_tso_segs);

	return segs;
}

/* Return the number of segments we want in the skb we are transmitting.
 * See if congestion control module wants to decide; otherwise, autosize.
 */
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	u32 min_tso, tso_segs;

	min_tso = ca_ops->min_tso_segs ?
			ca_ops->min_tso_segs(sk) :
			sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;

	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}

/* Returns the portion of skb which can be sent right away */
static unsigned int tcp_mss_split_point(const struct sock *sk,
					const struct sk_buff *skb,
					unsigned int mss_now,
					unsigned int max_segs,
					int nonagle)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 partial, needed, window, max_len;

	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
	max_len = mss_now * max_segs;

	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
		return max_len;

	needed = min(skb->len, window);

	if (max_len <= needed)
		return max_len;

	partial = needed % mss_now;
	/* If last segment is not a full MSS, check if Nagle rules allow us
	 * to include this last segment in this skb.
	 * Otherwise, we'll split the skb at last MSS boundary
	 */
	if (tcp_nagle_check(partial != 0, tp, nonagle))
		return needed - partial;

	return needed;
}

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules?  If so, return how many segments are allowed.
 */
static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
					 const struct sk_buff *skb)
{
	u32 in_flight, cwnd, halfcwnd;

	/* Don't be strict about the congestion window for the final FIN.  */
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
	    tcp_skb_pcount(skb) == 1)
		return 1;

	in_flight = tcp_packets_in_flight(tp);
	cwnd = tp->snd_cwnd;
	if (in_flight >= cwnd)
		return 0;

	/* For better scheduling between multiple flows, avoid scheduling a
	 * huge number of packets at once.
	 */
	halfcwnd = max(cwnd >> 1, 1U);
	return min(halfcwnd, cwnd - in_flight);
}

/* Initialize TSO state of a skb.
 * This must be invoked the first time we consider transmitting
 * SKB onto the wire.
 */
static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
	int tso_segs = tcp_skb_pcount(skb);

	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
		tcp_set_skb_tso_segs(skb, mss_now);
		tso_segs = tcp_skb_pcount(skb);
	}
	return tso_segs;
}

/* Return true if the Nagle test allows this packet to be
 * sent now.
 */
static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
				  unsigned int cur_mss, int nonagle)
{
	/* Nagle rule does not apply to frames, which sit in the middle of the
	 * write_queue (they have no chances to get new data).
	 *
	 * This is implemented in the callers, where they modify the 'nonagle'
	 * argument based upon the location of SKB in the send queue.
	 */
	if (nonagle & TCP_NAGLE_PUSH)
		return true;

	/* Don't use the nagle rule for urgent data (or for the final FIN). */
	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
		return true;

	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
		return true;

	return false;
}

/* Does at least the first segment of SKB fit into the send window? */
static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
			     const struct sk_buff *skb,
			     unsigned int cur_mss)
{
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;

	if (skb->len > cur_mss)
		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;

	return !after(end_seq, tcp_wnd_end(tp));
}

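/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list.  It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation.  In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 */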
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
			unsigned int mss_now, gfp_t gfp)
{
	int nlen = skb->len - len;
	struct sk_buff *buff;
	u8 flags;

	/* All of a TSO frame must be composed of paged data.  */
	if (skb->len != skb->data_len)
		return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
				    skb, len, mss_now, gfp);

	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
	if (unlikely(!buff))
		return -ENOMEM;
	skb_copy_decrypted(buff, skb);
	mptcp_skb_ext_copy(buff, skb);

	sk_wmem_queued_add(sk, buff->truesize);
	sk_mem_charge(sk, buff->truesize);
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
	TCP_SKB_CB(buff)->tcp_flags = flags;

	/* This packet was never sent out yet, so no SACK bits. */
	TCP_SKB_CB(buff)->sacked = 0;

	tcp_skb_fragment_eor(skb, buff);

	buff->ip_summed = CHECKSUM_PARTIAL;
	skb_split(skb, buff, len);
	tcp_fragment_tstamp(skb, buff);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(skb, mss_now);
	tcp_set_skb_tso_segs(buff, mss_now);

	/* Link BUFF into the send queue. */
	__skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);

	return 0;
}

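/* Try to defer sending, if possible, in order to minimize the amount
 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
 *
 * This algorithm is from John Heffner.
 */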
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
				 bool *is_cwnd_limited,
				 bool *is_rwnd_limited,
				 u32 max_segs)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 send_win, cong_win, limit, in_flight;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *head;
	int win_divisor;
	s64 delta;

	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
		goto send_now;

	/* Avoid bursty behavior by allowing defer
	 * only if the last write was recent (1 ms).
	 * Note that tp->tcp_wstamp_ns can be in the future if we have
	 * packets waiting in a qdisc or device for EDT delivery.
	 */
	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
	if (delta > 0)
		goto send_now;

	in_flight = tcp_packets_in_flight(tp);

	BUG_ON(tcp_skb_pcount(skb) <= 1);
	BUG_ON(tp->snd_cwnd <= in_flight);

	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

	/* From in_flight test above, we know that cwnd > in_flight.  */
	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;

	limit = min(send_win, cong_win);

	/* If a full-sized TSO skb can be sent, do it. */
	if (limit >= max_segs * tp->mss_cache)
		goto send_now;

	/* Middle in queue won't get any more data, full sendable already? */
	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
		goto send_now;

	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
	if (win_divisor) {
		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);

		/* If at least some fraction of a window is available,
		 * just use it.
		 */
		chunk /= win_divisor;
		if (limit >= chunk)
			goto send_now;
	} else {
		/* Different approach, try not to defer past a single
		 * ACK.  Receiver should ACK every other full sized
		 * frame, so if we have space for more than 3 frames
		 * then send now.
		 */
		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
			goto send_now;
	}

	/* TODO : use tsorted_sent_queue ? */
	head = tcp_rtx_queue_head(sk);
	if (!head)
		goto send_now;
	delta = tp->tcp_clock_cache - head->tstamp;
	/* If next ACK is likely to come too late (half srtt), do not defer */
	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
		goto send_now;

	/* Ok, it looks like it is advisable to defer.
	 * Three cases are tracked :
	 * 1) We are cwnd-limited
	 * 2) We are rwnd-limited
	 * 3) We are application limited.
	 */
	if (cong_win < send_win) {
		if (cong_win <= skb->len) {
			*is_cwnd_limited = true;
			return true;
		}
	} else {
		if (send_win <= skb->len) {
			*is_rwnd_limited = true;
			return true;
		}
	}

	/* If this packet won't get more data, do not wait. */
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
	    TCP_SKB_CB(skb)->eor)
		goto send_now;

	return true;

send_now:
	return false;
}

static inline void tcp_mtu_check_reprobe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	u32 interval;
	s32 delta;

	interval = net->ipv4.sysctl_tcp_probe_interval;
	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
	if (unlikely(delta >= interval * HZ)) {
		int mss = tcp_current_mss(sk);

		/* Update current search range */
		icsk->icsk_mtup.probe_size = 0;
		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
			sizeof(struct tcphdr) +
			icsk->icsk_af_ops->net_header_len;
		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);

		/* Update probe time stamp */
		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
	}
}

static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
{
	struct sk_buff *skb, *next;

	skb = tcp_send_head(sk);
	tcp_for_write_queue_from_safe(skb, next, sk) {
		if (len <= skb->len)
			break;

		if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
			return false;

		len -= skb->len;
	}

	return true;
}

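/* Create a new MTU probe if we are ready.
 * MTU probe is regularly attempting to increase the path MTU by
 * deliberately sending larger packets.  This discovers routing
 * changes resulting in larger path MTUs.
 *
 * Returns 0 if we should wait to probe (no cwnd available),
 *         1 if a probe was sent,
 *         -1 otherwise
 */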
static int tcp_mtu_probe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb, *nskb, *next;
	struct net *net = sock_net(sk);
	int probe_size;
	int size_needed;
	int copy, len;
	int mss_now;
	int interval;

	/* Not currently probing/verifying,
	 * not in recovery,
	 * have enough cwnd, and
	 * not SACKing (the variable headers throw things off)
	 */
	if (likely(!icsk->icsk_mtup.enabled ||
		   icsk->icsk_mtup.probe_size ||
		   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
		   tp->snd_cwnd < 11 ||
		   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
		return -1;

	/* Use binary search for probe_size between the current search range
	 * bounds. If (search_high - search_low) is smaller than a threshold,
	 * back off from probing.
	 */
	mss_now = tcp_current_mss(sk);
	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
				    icsk->icsk_mtup.search_low) >> 1);
	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;

	/* When misfortune happens, we are reprobing actively,
	 * and then reprobe timer has expired. We stick with current
	 * probing process by not resetting search range to its original.
	 */
	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
	    interval < net->ipv4.sysctl_tcp_probe_threshold) {
		/* Check whether enough time has elapsed for
		 * another round of probing.
		 */
		tcp_mtu_check_reprobe(sk);
		return -1;
	}

	/* Have enough data in the send queue to probe? */
	if (tp->write_seq - tp->snd_nxt < size_needed)
		return -1;

	if (tp->snd_wnd < size_needed)
		return -1;
	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
		return 0;

	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
		if (!tcp_packets_in_flight(tp))
			return -1;
		else
			return 0;
	}

	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
		return -1;

	/* We're allowed to probe.  Build it now. */
	nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
	if (!nskb)
		return -1;
	sk_wmem_queued_add(sk, nskb->truesize);
	sk_mem_charge(sk, nskb->truesize);

	skb = tcp_send_head(sk);
	skb_copy_decrypted(nskb, skb);
	mptcp_skb_ext_copy(nskb, skb);

	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
	TCP_SKB_CB(nskb)->sacked = 0;
	nskb->csum = 0;
	nskb->ip_summed = CHECKSUM_PARTIAL;

	tcp_insert_write_queue_before(nskb, skb, sk);
	tcp_highest_sack_replace(sk, skb, nskb);

	len = 0;
	tcp_for_write_queue_from_safe(skb, next, sk) {
		copy = min_t(int, skb->len, probe_size - len);
		skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);

		if (skb->len <= copy) {
			/* We've eaten all the data from this skb.
			 * Throw it away.
			 */
			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
			/* If this is the last SKB we copy and eor is set
			 * we need to propagate it to the new skb.
			 */
			TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
			tcp_skb_collapse_tstamp(nskb, skb);
			tcp_unlink_write_queue(skb, sk);
			sk_wmem_free_skb(sk, skb);
		} else {
			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
						   ~(TCPHDR_FIN|TCPHDR_PSH);
			if (!skb_shinfo(skb)->nr_frags) {
				skb_pull(skb, copy);
			} else {
				__pskb_trim_head(skb, copy);
				tcp_set_skb_tso_segs(skb, mss_now);
			}
			TCP_SKB_CB(skb)->seq += copy;
		}

		len += copy;

		if (len >= probe_size)
			break;
	}
	tcp_init_tso_segs(nskb, nskb->len);

	/* We're ready to send.  If this fails, the probe will
	 * be resegmented into mss-sized pieces by tcp_write_xmit().
	 */
	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
		/* Decrement cwnd here because we are sending
		 * effectively two packets.
		 */
		tp->snd_cwnd--;
		tcp_event_new_data_sent(sk, nskb);

		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;

		return 1;
	}

	return -1;
}

static bool tcp_pacing_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_needs_internal_pacing(sk))
		return false;

	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
		return false;

	if (!hrtimer_is_queued(&tp->pacing_timer)) {
		hrtimer_start(&tp->pacing_timer,
			      ns_to_ktime(tp->tcp_wstamp_ns),
			      HRTIMER_MODE_ABS_PINNED_SOFT);
		sock_hold(sk);
	}
	return true;
}

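/* TCP Small Queues :
 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 * (These limits are doubled for retransmits)
 * This allows for :
 *  - better RTT estimation and ACK scheduling
 *  - faster recovery
 *  - high rates
 */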
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
				  unsigned int factor)
{
	unsigned long limit;

	limit = max_t(unsigned long,
		      2 * skb->truesize,
		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
	if (sk->sk_pacing_status == SK_PACING_NONE)
		limit = min_t(unsigned long, limit,
			      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
	limit <<= factor;

	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
	    tcp_sk(sk)->tcp_tx_delay) {
		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;

		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
		 * approximate our needs assuming an ~100% skb->truesize overhead.
		 * USEC_PER_SEC is approximated by 2^20.
		 * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
		 */
		extra_bytes >>= (20 - 1);
		limit += extra_bytes;
	}
	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
		/* Always send skb if rtx queue is empty.
		 * No need to wait for TX completion to call us back,
		 * after softirq/tasklet schedule.
		 * This helps when TX completions are delayed too much.
		 */
		if (tcp_rtx_queue_empty(sk))
			return false;

		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED, so we must
		 * test again the condition.
		 */
		smp_mb__after_atomic();
		if (refcount_read(&sk->sk_wmem_alloc) > limit)
			return true;
	}
	return false;
}

static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
{
	const u32 now = tcp_jiffies32;
	enum tcp_chrono old = tp->chrono_type;

	if (old > TCP_CHRONO_UNSPEC)
		tp->chrono_stat[old - 1] += now - tp->chrono_start;
	tp->chrono_start = now;
	tp->chrono_type = new;
}

void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* If there are multiple conditions worthy of tracking in a
	 * chronograph then the highest priority enum takes precedence
	 * over the other conditions. So that if something "more interesting"
	 * starts happening, stop the previous chrono and start a new one.
	 */
	if (type > tp->chrono_type)
		tcp_chrono_set(tp, type);
}

void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* There are multiple conditions worthy of tracking in a
	 * chronograph, so the highest priority enum takes precedence
	 * over the other conditions (see tcp_chrono_start).
	 * If a condition stops, we only stop chrono tracking if
	 * it's the "most interesting" or current chrono we are
	 * tracking, and we start the busy chrono if we have pending data.
	 */
	if (tcp_rtx_and_write_queues_empty(sk))
		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
	else if (type == tp->chrono_type)
		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
}

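/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
 * snd_up-64K-mss .. snd_up cannot be large. However, taking into
 * account rare use of URG, this is not a big flaw.
 *
 * Send at most one packet when push_one > 0. Temporarily ignore
 * cwnd limit to force at most one packet out when push_one == 2.
 *
 * Returns true, if no segments are in flight and we have queued segments,
 * but cannot send anything now because of SWS or another problem.
 */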
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;
	bool is_cwnd_limited = false, is_rwnd_limited = false;
	u32 max_segs;

	sent_pkts = 0;

	tcp_mstamp_refresh(tp);
	if (!push_one) {
		/* Do MTU probing. */
		result = tcp_mtu_probe(sk);
		if (!result) {
			return false;
		} else if (result > 0) {
			sent_pkts = 1;
		}
	}

	max_segs = tcp_tso_segs(sk, mss_now);
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
			skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
			tcp_init_tso_segs(skb, mss_now);
			goto repair; /* Skip network transmission */
		}

		if (tcp_pacing_check(sk))
			break;

		tso_segs = tcp_init_tso_segs(skb, mss_now);
		BUG_ON(!tso_segs);

		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota) {
			if (push_one == 2)
				/* Force out a loss probe pkt. */
				cwnd_quota = 1;
			else
				break;
		}

		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
			is_rwnd_limited = true;
			break;
		}

		if (tso_segs == 1) {
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			if (!push_one &&
			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
						 &is_rwnd_limited, max_segs))
				break;
		}

		limit = mss_now;
		if (tso_segs > 1 && !tcp_urg_mode(tp))
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    min_t(unsigned int,
							  cwnd_quota,
							  max_segs),
						    nonagle);

		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
			break;

		if (tcp_small_queue_check(sk, skb, 0))
			break;

		/* Do not send an empty skb: another thread sharing the queue
		 * may still be appending payload to it.
		 */
		if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
			break;

		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
			break;

repair:
		/* Advance the send_head.  This one is sent out.
		 * This call will increment packets_out.
		 */
		tcp_event_new_data_sent(sk, skb);

		tcp_minshall_update(tp, mss_now, skb);
		sent_pkts += tcp_skb_pcount(skb);

		if (push_one)
			break;
	}

	if (is_rwnd_limited)
		tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
	else
		tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);

	is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
	if (likely(sent_pkts || is_cwnd_limited))
		tcp_cwnd_validate(sk, is_cwnd_limited);

	if (likely(sent_pkts)) {
		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += sent_pkts;

		/* Send one loss probe per tail loss episode. */
		if (push_one != 2)
			tcp_schedule_loss_probe(sk, false);
		return false;
	}
	return !tp->packets_out && !tcp_write_queue_empty(sk);
}

bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, rto_delta_us;
	int early_retrans;

	/* Don't do any loss probe on a Fast Open connection before 3WHS
	 * finishes.
	 */
	if (rcu_access_pointer(tp->fastopen_rsk))
		return false;

	early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
	/* Schedule a loss probe in 2*RTT for SACK capable connections
	 * not in loss recovery, that are either limited by cwnd or application.
	 */
	if ((early_retrans != 3 && early_retrans != 4) ||
	    !tp->packets_out || !tcp_is_sack(tp) ||
	    (icsk->icsk_ca_state != TCP_CA_Open &&
	     icsk->icsk_ca_state != TCP_CA_CWR))
		return false;

	/* Probe timeout is 2*rtt. Add minimum RTO to account
	 * for delayed ack when there's one outstanding packet. If no RTT
	 * sample is available then probe after TCP_TIMEOUT_INIT.
	 */
	if (tp->srtt_us) {
		timeout = usecs_to_jiffies(tp->srtt_us >> 2);
		if (tp->packets_out == 1)
			timeout += TCP_RTO_MIN;
		else
			timeout += TCP_TIMEOUT_MIN;
	} else {
		timeout = TCP_TIMEOUT_INIT;
	}

	/* If the RTO formula yields an earlier time, then use that time. */
	rto_delta_us = advancing_rto ?
			jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
			tcp_rto_delta_us(sk);  /* How far in future is RTO? */
	if (rto_delta_us > 0)
		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));

	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
	return true;
}

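/* Thanks to skb fast clones, we can detect if a prior transmit of
 * a packet is still in a qdisc or driver queue.
 * In this case, there is very little point doing a retransmit !
 */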
static bool skb_still_in_host_queue(const struct sock *sk,
				    const struct sk_buff *skb)
{
	if (unlikely(skb_fclone_busy(sk, skb))) {
		NET_INC_STATS(sock_net(sk),
			      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
		return true;
	}
	return false;
}

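/* When probe timeout (PTO) fires, try send a new segment if possible, else
 * retransmit the last segment.
 */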
void tcp_send_loss_probe(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int pcount;
	int mss = tcp_current_mss(sk);

	/* At most one outstanding TLP retransmission. */
	if (tp->tlp_high_seq)
		goto rearm_timer;

	tp->tlp_retrans = 0;
	skb = tcp_send_head(sk);
	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
		pcount = tp->packets_out;
		tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
		if (tp->packets_out > pcount)
			goto probe_sent;
		goto rearm_timer;
	}
	skb = skb_rb_last(&sk->tcp_rtx_queue);
	if (unlikely(!skb)) {
		WARN_ONCE(tp->packets_out,
			  "invalid inflight: %u state %u cwnd %u mss %d\n",
			  tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
		inet_csk(sk)->icsk_pending = 0;
		return;
	}

	if (skb_still_in_host_queue(sk, skb))
		goto rearm_timer;

	pcount = tcp_skb_pcount(skb);
	if (WARN_ON(!pcount))
		goto rearm_timer;

	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
		if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
					  (pcount - 1) * mss, mss,
					  GFP_ATOMIC)))
			goto rearm_timer;
		skb = skb_rb_next(skb);
	}

	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
		goto rearm_timer;

	if (__tcp_retransmit_skb(sk, skb, 1))
		goto rearm_timer;

	tp->tlp_retrans = 1;

probe_sent:
	/* Record snd_nxt for loss detection. */
	tp->tlp_high_seq = tp->snd_nxt;

	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
	/* Reset s.t. tcp_rearm_rto will restart timer from now */
	inet_csk(sk)->icsk_pending = 0;
rearm_timer:
	tcp_rearm_rto(sk);
}

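/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 */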
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and
	 * all will be happy.
	 */
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return;

	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
			   sk_gfp_mask(sk, GFP_ATOMIC)))
		tcp_check_probe_timer(sk);
}

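/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 */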
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
	struct sk_buff *skb = tcp_send_head(sk);

	BUG_ON(!skb || skb->len < mss_now);

	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
}

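/* Select the size of the receive window to advertise.
 *
 * The window is bounded by the free receive space, capped by
 * rcv_ssthresh and the window clamp.  Once free space drops below
 * half of the full space we apply receiver-side SWS avoidance: the
 * offered window is aligned to the window scaling granularity (or
 * rounded down to a multiple of the MSS when scaling is off), and a
 * zero window is advertised once free space falls below 1/16 of the
 * allowed space or below one rcv_mss.
 */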
u32 __tcp_select_window(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	int mss = icsk->icsk_ack.rcv_mss;
	int free_space = tcp_space(sk);
	int allowed_space = tcp_full_space(sk);
	int full_space, window;

	if (sk_is_mptcp(sk))
		mptcp_space(sk, &free_space, &allowed_space);

	full_space = min_t(int, tp->window_clamp, allowed_space);

	if (unlikely(mss > full_space)) {
		mss = full_space;
		if (mss <= 0)
			return 0;
	}
	if (free_space < (full_space >> 1)) {
		icsk->icsk_ack.quick = 0;

		if (tcp_under_memory_pressure(sk))
			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
					       4U * tp->advmss);

		/* free_space might become our new window, make sure we don't
		 * increase it due to wscale.
		 */
		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);

		/* if free space is less than mss estimate, or is below 1/16th
		 * of the maximum allowed, try to move to zero-window, else
		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
		 * new incoming data is dropped due to memory limits.
		 * With large window, mss test triggers way too late in order
		 * to announce zero window in time before rmem limit kicks in.
		 */
		if (free_space < (allowed_space >> 4) || free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 */
	if (tp->rx_opt.rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Important case: prevent zero window announcement if
		 * 1<<rcv_wscale > mss.
		 */
		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
	} else {
		window = tp->rcv_wnd;
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = rounddown(free_space, mss);
		else if (mss == full_space &&
			 free_space > window + (full_space >> 1))
			window = free_space;
	}

	return window;
}

void tcp_skb_collapse_tstamp(struct sk_buff *skb,
			     const struct sk_buff *next_skb)
{
	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
		const struct skb_shared_info *next_shinfo =
			skb_shinfo(next_skb);
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
		shinfo->tskey = next_shinfo->tskey;
		TCP_SKB_CB(skb)->txstamp_ack |=
			TCP_SKB_CB(next_skb)->txstamp_ack;
	}
}

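/* Collapses two adjacent SKB's during retransmission. */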
static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *next_skb = skb_rb_next(skb);
	int next_skb_size;

	next_skb_size = next_skb->len;

	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);

	if (next_skb_size) {
		if (next_skb_size <= skb_availroom(skb))
			skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
				      next_skb_size);
		else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
			return false;
	}
	tcp_highest_sack_replace(sk, next_skb, skb);

	/* Update sequence range on original skb. */
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

	/* Merge over control information. This moves PSH/FIN etc. over */
	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;

	/* All done, get rid of second SKB and account for it so
	 * packet counting does not break.
	 */
	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;

	/* changed transmit queue under us so clear hints */
	tcp_clear_retrans_hints_partial(tp);
	if (next_skb == tp->retransmit_skb_hint)
		tp->retransmit_skb_hint = skb;

	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));

	tcp_skb_collapse_tstamp(skb, next_skb);

	tcp_rtx_queue_unlink_and_free(next_skb, sk);
	return true;
}

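/* Check if coalescing SKBs is legal. */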
static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
	if (tcp_skb_pcount(skb) > 1)
		return false;
	if (skb_cloned(skb))
		return false;
	/* Some heuristics for collapsing over SACK'd could be invented */
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
		return false;

	return true;
}

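/* Collapse packets in the retransmit queue to create fewer packets
 * on the wire. This is only done on retransmission.
 */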
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
				     int space)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = to, *tmp;
	bool first = true;

	if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
		return;
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
		return;

	skb_rbtree_walk_from_safe(skb, tmp) {
		if (!tcp_can_collapse(sk, skb))
			break;

		if (!tcp_skb_can_collapse(to, skb))
			break;

		space -= skb->len;

		if (first) {
			first = false;
			continue;
		}

		if (space < 0)
			break;

		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
			break;

		if (!tcp_collapse_retrans(sk, to))
			break;
	}
}

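/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */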
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int cur_mss;
	int diff, len, err;

	/* Inconclusive MTU probe */
	if (icsk->icsk_mtup.probe_size)
		icsk->icsk_mtup.probe_size = 0;

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (refcount_read(&sk->sk_wmem_alloc) >
	    min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
		  sk->sk_sndbuf))
		return -EAGAIN;

	if (skb_still_in_host_queue(sk, skb))
		return -EBUSY;

	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	cur_mss = tcp_current_mss(sk);

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
	    TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	len = cur_mss * segs;
	if (skb->len > len) {
		if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
				 cur_mss, GFP_ATOMIC))
			return -ENOMEM; /* We'll try again later. */
	} else {
		if (skb_unclone(skb, GFP_ATOMIC))
			return -ENOMEM;

		diff = tcp_skb_pcount(skb);
		tcp_set_skb_tso_segs(skb, cur_mss);
		diff -= tcp_skb_pcount(skb);
		if (diff)
			tcp_adjust_pcount(sk, skb, diff);
		if (skb->len < cur_mss)
			tcp_retrans_try_collapse(sk, skb, cur_mss);
	}

	/* RFC3168, section 6.1.1.1. ECN fallback */
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
		tcp_ecn_clear_syn(sk, skb);

	/* Update global and local TCP statistics. */
	segs = tcp_skb_pcount(skb);
	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
	tp->total_retrans += segs;
	tp->bytes_retrans += skb->len;

	/* make sure skb->data is aligned on arches that require it
	 * and check if ack-trimming & collapsing extended the headroom
	 * beyond what csum_start can cover.
	 */
	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
		     skb_headroom(skb) >= 0xFFFF)) {
		struct sk_buff *nskb;

		tcp_skb_tsorted_save(skb) {
			nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
			if (nskb) {
				nskb->dev = NULL;
				err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
			} else {
				err = -ENOBUFS;
			}
		} tcp_skb_tsorted_restore(skb);

		if (!err) {
			tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
			tcp_rate_skb_sent(sk, skb);
		}
	} else {
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	}

	/* To avoid taking spuriously low RTT samples based on a timestamp
	 * for a transmit that never happened, always mark EVER_RETRANS
	 */
	TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;

	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
				  TCP_SKB_CB(skb)->seq, segs, err);

	if (likely(!err)) {
		trace_tcp_retransmit_skb(sk, skb);
	} else if (err != -EBUSY) {
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
	}
	return err;
}

int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err = __tcp_retransmit_skb(sk, skb, segs);

	if (err == 0) {
#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
			net_dbg_ratelimited("retrans_out leaked\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out += tcp_skb_pcount(skb);
	}

	/* Save stamp of the first (attempted) retransmit. */
	if (!tp->retrans_stamp)
		tp->retrans_stamp = tcp_skb_timestamp(skb);

	if (tp->undo_retrans < 0)
		tp->undo_retrans = 0;
	tp->undo_retrans += tcp_skb_pcount(skb);
	return err;
}

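/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 */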
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct sk_buff *skb, *rtx_head, *hole = NULL;
	struct tcp_sock *tp = tcp_sk(sk);
	bool rearm_timer = false;
	u32 max_segs;
	int mib_idx;

	if (!tp->packets_out)
		return;

	rtx_head = tcp_rtx_queue_head(sk);
	skb = tp->retransmit_skb_hint ?: rtx_head;
	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
	skb_rbtree_walk_from(skb) {
		__u8 sacked;
		int segs;

		if (tcp_pacing_check(sk))
			break;

		/* we could do better than to assign each time */
		if (!hole)
			tp->retransmit_skb_hint = skb;

		segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
		if (segs <= 0)
			break;
		sacked = TCP_SKB_CB(skb)->sacked;
		/* In case tcp_shift_skb_data() have aggregated large skbs,
		 * we need to make sure we are not sending too big TSO packets
		 */
		segs = min_t(int, segs, max_segs);

		if (tp->retrans_out >= tp->lost_out) {
			break;
		} else if (!(sacked & TCPCB_LOST)) {
			if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
				hole = skb;
			continue;

		} else {
			if (icsk->icsk_ca_state != TCP_CA_Loss)
				mib_idx = LINUX_MIB_TCPFASTRETRANS;
			else
				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
		}

		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
			continue;

		if (tcp_small_queue_check(sk, skb, 1))
			break;

		if (tcp_retransmit_skb(sk, skb, segs))
			break;

		NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));

		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += tcp_skb_pcount(skb);

		if (skb == rtx_head &&
		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
			rearm_timer = true;
	}
	if (rearm_timer)
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				     inet_csk(sk)->icsk_rto,
				     TCP_RTO_MAX);
}

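/* We allow to exceed memory limits for FIN packets to expedite
 * connection tear down and (memory) recovery.
 * Otherwise tcp_send_fin() could be tempted to either delay FIN
 * or even be forced to close flow without any FIN.
 * In general, we want to allow one skb per socket to avoid hangs
 * with edge trigger epoll()
 */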
void sk_forced_mem_schedule(struct sock *sk, int size)
{
	int amt;

	if (size <= sk->sk_forward_alloc)
		return;
	amt = sk_mem_pages(size);
	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
	sk_memory_allocated_add(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_charge_skmem(sk->sk_memcg, amt);
}

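/* Send a FIN. The caller locks the socket for us.
 * We should try to send a FIN packet really hard, but eventually give up.
 */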
void tcp_send_fin(struct sock *sk)
{
	struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* Optimization, tack on the FIN if we have one skb in write queue and
	 * this skb was not yet sent, or we are under memory pressure.
	 * Note: in the latter case, FIN packet will be sent after a timeout,
	 * as TCP stack thinks it has already been transmitted.
	 */
	tskb = tail;
	if (!tskb && tcp_under_memory_pressure(sk))
		tskb = skb_rb_last(&sk->tcp_rtx_queue);

	if (tskb) {
		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
		TCP_SKB_CB(tskb)->end_seq++;
		tp->write_seq++;
		if (!tail) {
			/* This means tskb was already sent.
			 * Pretend we included the FIN on previous transmit.
			 * We need to set tp->snd_nxt to the value it would have
			 * if FIN had been sent. This is because retransmit path
			 * does not change tp->snd_nxt.
			 */
			WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
			return;
		}
	} else {
		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
		if (unlikely(!skb))
			return;

		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
		skb_reserve(skb, MAX_TCP_HEADER);
		sk_forced_mem_schedule(sk, skb->truesize);
		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		tcp_init_nondata_skb(skb, tp->write_seq,
				     TCPHDR_ACK | TCPHDR_FIN);
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}

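/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by RFC 2525, section 2.17.  -DaveM
 */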
void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
	struct sk_buff *skb;

	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
			     TCPHDR_ACK | TCPHDR_RST);
	tcp_mstamp_refresh(tcp_sk(sk));
	/* Send it off. */
	if (tcp_transmit_skb(sk, skb, 0, priority))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);

	/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
	 * skb here is different to the troublesome skb, so use NULL
	 */
	trace_tcp_send_reset(sk, NULL);
}

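/* Send a crossed SYN-ACK during socket establishment.
 * WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */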
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff *skb;

	skb = tcp_rtx_queue_head(sk);
	if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
		pr_err("%s: wrong queue state\n", __func__);
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
		if (skb_cloned(skb)) {
			struct sk_buff *nskb;

			tcp_skb_tsorted_save(skb) {
				nskb = skb_copy(skb, GFP_ATOMIC);
			} tcp_skb_tsorted_restore(skb);
			if (!nskb)
				return -ENOMEM;
			INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
			tcp_highest_sack_replace(sk, skb, nskb);
			tcp_rtx_queue_unlink_and_free(skb, sk);
			__skb_header_release(nskb);
			tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
			sk_wmem_queued_add(sk, nskb->truesize);
			sk_mem_charge(sk, nskb->truesize);
			skb = nskb;
		}

		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
		tcp_ecn_send_synack(sk, skb);
	}
	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}

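/**
 * tcp_make_synack - Allocate one skb and build a SYNACK packet.
 * @sk: listener socket
 * @dst: dst entry attached to the SYNACK. It is consumed and caller
 *       should not use it again.
 * @req: request_sock pointer
 * @foc: cookie for tcp fast open
 * @synack_type: Type of synack to prepare
 * @syn_skb: SYN packet just received.  It could be NULL for rtx case.
 */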
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
				struct request_sock *req,
				struct tcp_fastopen_cookie *foc,
				enum tcp_synack_type synack_type,
				struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *md5 = NULL;
	struct tcp_out_options opts;
	struct sk_buff *skb;
	int tcp_header_size;
	struct tcphdr *th;
	int mss;
	u64 now;

	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (unlikely(!skb)) {
		dst_release(dst);
		return NULL;
	}
	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	switch (synack_type) {
	case TCP_SYNACK_NORMAL:
		skb_set_owner_w(skb, req_to_sk(req));
		break;
	case TCP_SYNACK_COOKIE:
		/* Under synflood, we do not attach skb to a socket,
		 * to avoid false sharing.
		 */
		break;
	case TCP_SYNACK_FASTOPEN:
		/* sk is a const pointer, because we want to express multiple
		 * cpu might call us concurrently.
		 * sk->sk_wmem_alloc in an atomic, we can promote to rw.
		 */
		skb_set_owner_w(skb, (struct sock *)sk);
		break;
	}
	skb_dst_set(skb, dst);

	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));

	memset(&opts, 0, sizeof(opts));
	now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
	if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
		skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
	else
#endif
	{
		skb->skb_mstamp_ns = now;
		if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
			tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
	}

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
	/* bpf program will be interested in the tcp_flags */
	TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
					     foc, synack_type,
					     syn_skb) + sizeof(*th);

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	th = (struct tcphdr *)skb->data;
	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	tcp_ecn_make_synack(req, th);
	th->source = htons(ireq->ir_num);
	th->dest = ireq->ir_rmt_port;
	skb->mark = ireq->ir_mark;
	skb->ip_summed = CHECKSUM_PARTIAL;
	th->seq = htonl(tcp_rsk(req)->snt_isn);
	/* XXX data is queued and acked as is. No buffer/window check */
	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
	th->doff = (tcp_header_size >> 2);
	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);

#ifdef CONFIG_TCP_MD5SIG
	/* Okay, we have all we need - do the md5 hash if needed */
	if (md5)
		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
					       md5, req_to_sk(req), skb);
	rcu_read_unlock();
#endif

	bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
				synack_type, &opts);

	skb->skb_mstamp_ns = now;
	tcp_add_tx_delay(skb, tp);

	return skb;
}
EXPORT_SYMBOL(tcp_make_synack);

static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_congestion_ops *ca;
	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);

	if (ca_key == TCP_CA_UNSPEC)
		return;

	rcu_read_lock();
	ca = tcp_ca_find_key(ca_key);
	if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
		bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
		icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
		icsk->icsk_ca_ops = ca;
	}
	rcu_read_unlock();
}

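/* Do all connect socket setups that can be done AF independent. */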
static void tcp_connect_init(struct sock *sk)
{
	const struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__u8 rcv_wscale;
	u32 rcv_wnd;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr);
	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;

#ifdef CONFIG_TCP_MD5SIG
	if (tp->af_specific->md5_lookup(sk, sk))
		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->rx_opt.user_mss)
		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
	tp->max_window = 0;
	tcp_mtup_init(sk);
	tcp_sync_mss(sk, dst_mtu(dst));

	tcp_ca_dst_init(sk, dst);

	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(sk);

	/* limit the window selection if the user enforce a smaller rx buffer */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
		tp->window_clamp = tcp_full_space(sk);

	rcv_wnd = tcp_rwnd_init_bpf(sk);
	if (rcv_wnd == 0)
		rcv_wnd = dst_metric(dst, RTAX_INITRWND);

	tcp_select_initial_window(sk, tcp_full_space(sk),
				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
				  &rcv_wscale,
				  rcv_wnd);

	tp->rx_opt.rcv_wscale = rcv_wscale;
	tp->rcv_ssthresh = tp->rcv_wnd;

	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tcp_init_wl(tp, 0);
	tcp_write_queue_purge(sk);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->snd_up = tp->write_seq;
	WRITE_ONCE(tp->snd_nxt, tp->write_seq);

	if (likely(!tp->repair))
		tp->rcv_nxt = 0;
	else
		tp->rcv_tstamp = tcp_jiffies32;
	tp->rcv_wup = tp->rcv_nxt;
	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);

	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
	inet_csk(sk)->icsk_retransmits = 0;
	tcp_clear_retrans(tp);
}

static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	tcb->end_seq += skb->len;
	__skb_header_release(skb);
	sk_wmem_queued_add(sk, skb->truesize);
	sk_mem_charge(sk, skb->truesize);
	WRITE_ONCE(tp->write_seq, tcb->end_seq);
	tp->packets_out += tcp_skb_pcount(skb);
}

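/* Build and send a SYN with data and (cached) Fast Open cookie. However,
 * queue a data-only packet after the regular SYN, such that regular SYNs
 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
 * only the SYN sequence, the data are retransmitted in the first ACK.
 * If cookie is not cached or other error occurs, falls back to send a
 * regular SYN with Fast Open cookie request option.
 */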
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_fastopen_request *fo = tp->fastopen_req;
	int space, err = 0;
	struct sk_buff *syn_data;

	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
		goto fallback;

	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
	 * user-MSS. Reserve maximum option space for middleboxes that add
	 * private TCP options (class 3)
	 */
	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);

	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
		MAX_TCP_OPTION_SPACE;

	space = min_t(size_t, space, fo->size);

	/* limit to order-0 allocations */
	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));

	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
	if (!syn_data)
		goto fallback;
	syn_data->ip_summed = CHECKSUM_PARTIAL;
	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
	if (space) {
		int copied = copy_from_iter(skb_put(syn_data, space), space,
					    &fo->data->msg_iter);
		if (unlikely(!copied)) {
			tcp_skb_tsorted_anchor_cleanup(syn_data);
			kfree_skb(syn_data);
			goto fallback;
		}
		if (copied != space) {
			skb_trim(syn_data, copied);
			space = copied;
		}
		skb_zcopy_set(syn_data, fo->uarg, NULL);
	}
	/* No more data pending in inet_wait_for_connect() */
	if (space == fo->size)
		fo->data = NULL;
	fo->copied = space;

	tcp_connect_queue_skb(sk, syn_data);
	if (syn_data->len)
		tcp_chrono_start(sk, TCP_CHRONO_BUSY);

	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);

	syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;

	/* Now full SYN+DATA was cloned and sent (or not),
	 * remove the SYN from the original skb (syn_data)
	 * we keep in write queue in case of a retransmit, as we
	 * also have the SYN packet (with no data) in the same queue.
	 */
	TCP_SKB_CB(syn_data)->seq++;
	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
	if (!err) {
		tp->syn_data = (fo->copied > 0);
		tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
		goto done;
	}

	/* data was not sent, put it in write_queue */
	__skb_queue_tail(&sk->sk_write_queue, syn_data);
	tp->packets_out -= tcp_skb_pcount(syn_data);

fallback:
	/* Send a regular SYN with Fast Open cookie request option */
	if (fo->cookie.len > 0)
		fo->cookie.len = 0;
	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
	if (err)
		tp->syn_fastopen = 0;
done:
	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
	return err;
}

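/* Build a SYN and send it off. */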
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	tcp_connect_init(sk);

	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
	tcp_mstamp_refresh(tp);
	tp->retrans_stamp = tcp_time_stamp(tp);
	tcp_connect_queue_skb(sk, buff);
	tcp_ecn_send_syn(sk, buff);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);

	/* Send off SYN; include data in Fast Open. */
	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
	tp->pushed_seq = tp->write_seq;
	buff = tcp_send_head(sk);
	if (unlikely(buff)) {
		WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
		tp->pushed_seq = TCP_SKB_CB(buff)->seq;
	}
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}
EXPORT_SYMBOL(tcp_connect);

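/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */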
void tcp_send_delayed_ack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	int ato = icsk->icsk_ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		const struct tcp_sock *tp = tcp_sk(sk);
		int max_ato = HZ / 2;

		if (inet_csk_in_pingpong_mode(sk) ||
		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt_us) {
			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
					TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't a older one earlier. */
	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
		/* If delack timer is about to expire, send ACK now. */
		if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
			tcp_send_ack(sk);
			return;
		}

		if (!time_before(timeout, icsk->icsk_ack.timeout))
			timeout = icsk->icsk_ack.timeout;
	}
	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
	icsk->icsk_ack.timeout = timeout;
	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}

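/* This routine sends an ack and also updates the window. */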
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
{
	struct sk_buff *buff;

	/* If we have been reset, we may not send again. */
	if (sk->sk_state == TCP_CLOSE)
		return;

	/* We are not putting this on the write queue, so
	 * tcp_transmit_skb() will set the ownership to this
	 * sock.
	 */
	buff = alloc_skb(MAX_TCP_HEADER,
			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
	if (unlikely(!buff)) {
		struct inet_connection_sock *icsk = inet_csk(sk);
		unsigned long delay;

		delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
		if (delay < TCP_RTO_MAX)
			icsk->icsk_ack.retry++;
		inet_csk_schedule_ack(sk);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(buff, MAX_TCP_HEADER);
	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
	 * too much.
	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
	 */
	skb_set_tcp_pure_ack(buff);

	/* Send it off, this clears delayed acks for us. */
	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
}
EXPORT_SYMBOL_GPL(__tcp_send_ack);

void tcp_send_ack(struct sock *sk)
{
	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
}

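/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we make while urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */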
static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER,
			sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
	if (!skb)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	/* Use a previous sequence.  This should cause the other
	 * end to send an ack.  Don't queue or clone SKB, just
	 * send it.
	 */
	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
	NET_INC_STATS(sock_net(sk), mib);
	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}

/* Called from setsockopt( ... TCP_REPAIR ) when repair mode is turned off. */
void tcp_send_window_probe(struct sock *sk)
{
	if (sk->sk_state == TCP_ESTABLISHED) {
		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
		tcp_mstamp_refresh(tcp_sk(sk));
		tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
	}
}

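/* Initiate keepalive or window probe from timer. */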
int tcp_write_wakeup(struct sock *sk, int mib)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (sk->sk_state == TCP_CLOSE)
		return -1;

	skb = tcp_send_head(sk);
	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
		int err;
		unsigned int mss = tcp_current_mss(sk);
		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

		/* We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS avoidance ( sender )
		 */
		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
		    skb->len > mss) {
			seg_size = min(seg_size, mss);
			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
			if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
					 skb, seg_size, mss, GFP_ATOMIC))
				return -1;
		} else if (!tcp_skb_pcount(skb))
			tcp_set_skb_tso_segs(skb, mss);

		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
		if (!err)
			tcp_event_new_data_sent(sk, skb);
		return err;
	} else {
		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
			tcp_xmit_probe_skb(sk, 1, mib);
		return tcp_xmit_probe_skb(sk, 0, mib);
	}
}

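/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */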
void tcp_send_probe0(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	unsigned long timeout;
	int err;

	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);

	if (tp->packets_out || tcp_write_queue_empty(sk)) {
		/* Cancel probe timer, if it is not required. */
		icsk->icsk_probes_out = 0;
		icsk->icsk_backoff = 0;
		icsk->icsk_probes_tstamp = 0;
		return;
	}

	icsk->icsk_probes_out++;
	if (err <= 0) {
		if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
			icsk->icsk_backoff++;
		timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
	} else {
		/* If packet was not sent due to local congestion,
		 * Let senders fight for local resources conservatively.
		 */
		timeout = TCP_RESOURCE_PROBE_INTERVAL;
	}

	timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
}

int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
{
	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
	struct flowi fl;
	int res;

	tcp_rsk(req)->txhash = net_tx_rndhash();
	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
				  NULL);
	if (!res) {
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
		if (unlikely(tcp_passive_fastopen(sk)))
			tcp_sk(sk)->total_retrans++;
		trace_tcp_retransmit_synack(sk, req);
	}
	return res;
}
EXPORT_SYMBOL(tcp_rtx_synack);