1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37#define pr_fmt(fmt) "TCP: " fmt
38
39#include <net/tcp.h>
40#include <net/mptcp.h>
41
42#include <linux/compiler.h>
43#include <linux/gfp.h>
44#include <linux/module.h>
45#include <linux/static_key.h>
46
47#include <trace/events/tcp.h>
48
49
50
51
52void tcp_mstamp_refresh(struct tcp_sock *tp)
53{
54 u64 val = tcp_clock_ns();
55
56 if (val > tp->tcp_clock_cache)
57 tp->tcp_clock_cache = val;
58
59 val = div_u64(val, NSEC_PER_USEC);
60 if (val > tp->tcp_mstamp)
61 tp->tcp_mstamp = val;
62}
63
64static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
65 int push_one, gfp_t gfp);
66
67
68static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
69{
70 struct inet_connection_sock *icsk = inet_csk(sk);
71 struct tcp_sock *tp = tcp_sk(sk);
72 unsigned int prior_packets = tp->packets_out;
73
74 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
75
76 __skb_unlink(skb, &sk->sk_write_queue);
77 tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
78
79 if (tp->highest_sack == NULL)
80 tp->highest_sack = skb;
81
82 tp->packets_out += tcp_skb_pcount(skb);
83 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
84 tcp_rearm_rto(sk);
85
86 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
87 tcp_skb_pcount(skb));
88}
89
90
91
92
93
94
95
96
97static inline __u32 tcp_acceptable_seq(const struct sock *sk)
98{
99 const struct tcp_sock *tp = tcp_sk(sk);
100
101 if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
102 (tp->rx_opt.wscale_ok &&
103 ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
104 return tp->snd_nxt;
105 else
106 return tcp_wnd_end(tp);
107}
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123static __u16 tcp_advertise_mss(struct sock *sk)
124{
125 struct tcp_sock *tp = tcp_sk(sk);
126 const struct dst_entry *dst = __sk_dst_get(sk);
127 int mss = tp->advmss;
128
129 if (dst) {
130 unsigned int metric = dst_metric_advmss(dst);
131
132 if (metric < mss) {
133 mss = metric;
134 tp->advmss = mss;
135 }
136 }
137
138 return (__u16)mss;
139}
140
141
142
143
144void tcp_cwnd_restart(struct sock *sk, s32 delta)
145{
146 struct tcp_sock *tp = tcp_sk(sk);
147 u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
148 u32 cwnd = tp->snd_cwnd;
149
150 tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
151
152 tp->snd_ssthresh = tcp_current_ssthresh(sk);
153 restart_cwnd = min(restart_cwnd, cwnd);
154
155 while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
156 cwnd >>= 1;
157 tp->snd_cwnd = max(cwnd, restart_cwnd);
158 tp->snd_cwnd_stamp = tcp_jiffies32;
159 tp->snd_cwnd_used = 0;
160}
161
162
163static void tcp_event_data_sent(struct tcp_sock *tp,
164 struct sock *sk)
165{
166 struct inet_connection_sock *icsk = inet_csk(sk);
167 const u32 now = tcp_jiffies32;
168
169 if (tcp_packets_in_flight(tp) == 0)
170 tcp_ca_event(sk, CA_EVENT_TX_START);
171
172 tp->lsndtime = now;
173
174
175
176
177 if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
178 inet_csk_enter_pingpong_mode(sk);
179}
180
181
182static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
183 u32 rcv_nxt)
184{
185 struct tcp_sock *tp = tcp_sk(sk);
186
187 if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
188 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
189 tp->compressed_ack - TCP_FASTRETRANS_THRESH);
190 tp->compressed_ack = TCP_FASTRETRANS_THRESH;
191 if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
192 __sock_put(sk);
193 }
194
195 if (unlikely(rcv_nxt != tp->rcv_nxt))
196 return;
197 tcp_dec_quickack_mode(sk, pkts);
198 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
199}
200
201
202u32 tcp_default_init_rwnd(u32 mss)
203{
204
205
206
207
208
209 u32 init_rwnd = TCP_INIT_CWND * 2;
210
211 if (mss > 1460)
212 init_rwnd = max((1460 * init_rwnd) / mss, 2U);
213 return init_rwnd;
214}
215
216
217
218
219
220
221
222
223void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
224 __u32 *rcv_wnd, __u32 *window_clamp,
225 int wscale_ok, __u8 *rcv_wscale,
226 __u32 init_rcv_wnd)
227{
228 unsigned int space = (__space < 0 ? 0 : __space);
229
230
231 if (*window_clamp == 0)
232 (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
233 space = min(*window_clamp, space);
234
235
236 if (space > mss)
237 space = rounddown(space, mss);
238
239
240
241
242
243
244
245
246
247 if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
248 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
249 else
250 (*rcv_wnd) = space;
251
252 (*rcv_wscale) = 0;
253 if (wscale_ok) {
254
255 space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
256 space = max_t(u32, space, sysctl_rmem_max);
257 space = min_t(u32, space, *window_clamp);
258 while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
259 space >>= 1;
260 (*rcv_wscale)++;
261 }
262 }
263
264 if (!init_rcv_wnd)
265 init_rcv_wnd = tcp_default_init_rwnd(mss);
266 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
267
268
269 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
270}
271EXPORT_SYMBOL(tcp_select_initial_window);
272
273
274
275
276
277
278static u16 tcp_select_window(struct sock *sk)
279{
280 struct tcp_sock *tp = tcp_sk(sk);
281 u32 old_win = tp->rcv_wnd;
282 u32 cur_win = tcp_receive_window(tp);
283 u32 new_win = __tcp_select_window(sk);
284
285
286 if (new_win < cur_win) {
287
288
289
290
291
292
293
294 if (new_win == 0)
295 NET_INC_STATS(sock_net(sk),
296 LINUX_MIB_TCPWANTZEROWINDOWADV);
297 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
298 }
299 tp->rcv_wnd = new_win;
300 tp->rcv_wup = tp->rcv_nxt;
301
302
303
304
305 if (!tp->rx_opt.rcv_wscale &&
306 sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
307 new_win = min(new_win, MAX_TCP_WINDOW);
308 else
309 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
310
311
312 new_win >>= tp->rx_opt.rcv_wscale;
313
314
315 if (new_win == 0) {
316 tp->pred_flags = 0;
317 if (old_win)
318 NET_INC_STATS(sock_net(sk),
319 LINUX_MIB_TCPTOZEROWINDOWADV);
320 } else if (old_win == 0) {
321 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
322 }
323
324 return new_win;
325}
326
327
328static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
329{
330 const struct tcp_sock *tp = tcp_sk(sk);
331
332 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
333 if (!(tp->ecn_flags & TCP_ECN_OK))
334 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
335 else if (tcp_ca_needs_ecn(sk) ||
336 tcp_bpf_ca_needs_ecn(sk))
337 INET_ECN_xmit(sk);
338}
339
340
341static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
342{
343 struct tcp_sock *tp = tcp_sk(sk);
344 bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
345 bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
346 tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
347
348 if (!use_ecn) {
349 const struct dst_entry *dst = __sk_dst_get(sk);
350
351 if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
352 use_ecn = true;
353 }
354
355 tp->ecn_flags = 0;
356
357 if (use_ecn) {
358 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
359 tp->ecn_flags = TCP_ECN_OK;
360 if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
361 INET_ECN_xmit(sk);
362 }
363}
364
365static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
366{
367 if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
368
369
370
371 TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
372}
373
374static void
375tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
376{
377 if (inet_rsk(req)->ecn_ok)
378 th->ece = 1;
379}
380
381
382
383
384static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
385 struct tcphdr *th, int tcp_header_len)
386{
387 struct tcp_sock *tp = tcp_sk(sk);
388
389 if (tp->ecn_flags & TCP_ECN_OK) {
390
391 if (skb->len != tcp_header_len &&
392 !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
393 INET_ECN_xmit(sk);
394 if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
395 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
396 th->cwr = 1;
397 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
398 }
399 } else if (!tcp_ca_needs_ecn(sk)) {
400
401 INET_ECN_dontxmit(sk);
402 }
403 if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
404 th->ece = 1;
405 }
406}
407
408
409
410
411static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
412{
413 skb->ip_summed = CHECKSUM_PARTIAL;
414
415 TCP_SKB_CB(skb)->tcp_flags = flags;
416 TCP_SKB_CB(skb)->sacked = 0;
417
418 tcp_skb_pcount_set(skb, 1);
419
420 TCP_SKB_CB(skb)->seq = seq;
421 if (flags & (TCPHDR_SYN | TCPHDR_FIN))
422 seq++;
423 TCP_SKB_CB(skb)->end_seq = seq;
424}
425
426static inline bool tcp_urg_mode(const struct tcp_sock *tp)
427{
428 return tp->snd_una != tp->snd_up;
429}
430
431#define OPTION_SACK_ADVERTISE (1 << 0)
432#define OPTION_TS (1 << 1)
433#define OPTION_MD5 (1 << 2)
434#define OPTION_WSCALE (1 << 3)
435#define OPTION_FAST_OPEN_COOKIE (1 << 8)
436#define OPTION_SMC (1 << 9)
437#define OPTION_MPTCP (1 << 10)
438
439static void smc_options_write(__be32 *ptr, u16 *options)
440{
441#if IS_ENABLED(CONFIG_SMC)
442 if (static_branch_unlikely(&tcp_have_smc)) {
443 if (unlikely(OPTION_SMC & *options)) {
444 *ptr++ = htonl((TCPOPT_NOP << 24) |
445 (TCPOPT_NOP << 16) |
446 (TCPOPT_EXP << 8) |
447 (TCPOLEN_EXP_SMC_BASE));
448 *ptr++ = htonl(TCPOPT_SMC_MAGIC);
449 }
450 }
451#endif
452}
453
454struct tcp_out_options {
455 u16 options;
456 u16 mss;
457 u8 ws;
458 u8 num_sack_blocks;
459 u8 hash_size;
460 u8 bpf_opt_len;
461 __u8 *hash_location;
462 __u32 tsval, tsecr;
463 struct tcp_fastopen_cookie *fastopen_cookie;
464 struct mptcp_out_options mptcp;
465};
466
467static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp,
468 struct tcp_out_options *opts)
469{
470#if IS_ENABLED(CONFIG_MPTCP)
471 if (unlikely(OPTION_MPTCP & opts->options))
472 mptcp_write_options(ptr, tp, &opts->mptcp);
473#endif
474}
475
476#ifdef CONFIG_CGROUP_BPF
477static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
478 enum tcp_synack_type synack_type)
479{
480 if (unlikely(!skb))
481 return BPF_WRITE_HDR_TCP_CURRENT_MSS;
482
483 if (unlikely(synack_type == TCP_SYNACK_COOKIE))
484 return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
485
486 return 0;
487}
488
489
490static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
491 struct request_sock *req,
492 struct sk_buff *syn_skb,
493 enum tcp_synack_type synack_type,
494 struct tcp_out_options *opts,
495 unsigned int *remaining)
496{
497 struct bpf_sock_ops_kern sock_ops;
498 int err;
499
500 if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
501 BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
502 !*remaining)
503 return;
504
505
506
507
508 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
509
510 sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
511
512 if (req) {
513
514
515
516
517
518
519
520
521
522
523
524
525
526 sock_ops.sk = (struct sock *)req;
527 sock_ops.syn_skb = syn_skb;
528 } else {
529 sock_owned_by_me(sk);
530
531 sock_ops.is_fullsock = 1;
532 sock_ops.sk = sk;
533 }
534
535 sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
536 sock_ops.remaining_opt_len = *remaining;
537
538 if (skb)
539 bpf_skops_init_skb(&sock_ops, skb, 0);
540
541 err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
542
543 if (err || sock_ops.remaining_opt_len == *remaining)
544 return;
545
546 opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
547
548 opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
549
550 *remaining -= opts->bpf_opt_len;
551}
552
553static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
554 struct request_sock *req,
555 struct sk_buff *syn_skb,
556 enum tcp_synack_type synack_type,
557 struct tcp_out_options *opts)
558{
559 u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
560 struct bpf_sock_ops_kern sock_ops;
561 int err;
562
563 if (likely(!max_opt_len))
564 return;
565
566 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
567
568 sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
569
570 if (req) {
571 sock_ops.sk = (struct sock *)req;
572 sock_ops.syn_skb = syn_skb;
573 } else {
574 sock_owned_by_me(sk);
575
576 sock_ops.is_fullsock = 1;
577 sock_ops.sk = sk;
578 }
579
580 sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
581 sock_ops.remaining_opt_len = max_opt_len;
582 first_opt_off = tcp_hdrlen(skb) - max_opt_len;
583 bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
584
585 err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
586
587 if (err)
588 nr_written = 0;
589 else
590 nr_written = max_opt_len - sock_ops.remaining_opt_len;
591
592 if (nr_written < max_opt_len)
593 memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
594 max_opt_len - nr_written);
595}
596#else
597static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
598 struct request_sock *req,
599 struct sk_buff *syn_skb,
600 enum tcp_synack_type synack_type,
601 struct tcp_out_options *opts,
602 unsigned int *remaining)
603{
604}
605
606static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
607 struct request_sock *req,
608 struct sk_buff *syn_skb,
609 enum tcp_synack_type synack_type,
610 struct tcp_out_options *opts)
611{
612}
613#endif
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
629 struct tcp_out_options *opts)
630{
631 u16 options = opts->options;
632
633 if (unlikely(OPTION_MD5 & options)) {
634 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
635 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
636
637 opts->hash_location = (__u8 *)ptr;
638 ptr += 4;
639 }
640
641 if (unlikely(opts->mss)) {
642 *ptr++ = htonl((TCPOPT_MSS << 24) |
643 (TCPOLEN_MSS << 16) |
644 opts->mss);
645 }
646
647 if (likely(OPTION_TS & options)) {
648 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
649 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
650 (TCPOLEN_SACK_PERM << 16) |
651 (TCPOPT_TIMESTAMP << 8) |
652 TCPOLEN_TIMESTAMP);
653 options &= ~OPTION_SACK_ADVERTISE;
654 } else {
655 *ptr++ = htonl((TCPOPT_NOP << 24) |
656 (TCPOPT_NOP << 16) |
657 (TCPOPT_TIMESTAMP << 8) |
658 TCPOLEN_TIMESTAMP);
659 }
660 *ptr++ = htonl(opts->tsval);
661 *ptr++ = htonl(opts->tsecr);
662 }
663
664 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
665 *ptr++ = htonl((TCPOPT_NOP << 24) |
666 (TCPOPT_NOP << 16) |
667 (TCPOPT_SACK_PERM << 8) |
668 TCPOLEN_SACK_PERM);
669 }
670
671 if (unlikely(OPTION_WSCALE & options)) {
672 *ptr++ = htonl((TCPOPT_NOP << 24) |
673 (TCPOPT_WINDOW << 16) |
674 (TCPOLEN_WINDOW << 8) |
675 opts->ws);
676 }
677
678 if (unlikely(opts->num_sack_blocks)) {
679 struct tcp_sack_block *sp = tp->rx_opt.dsack ?
680 tp->duplicate_sack : tp->selective_acks;
681 int this_sack;
682
683 *ptr++ = htonl((TCPOPT_NOP << 24) |
684 (TCPOPT_NOP << 16) |
685 (TCPOPT_SACK << 8) |
686 (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
687 TCPOLEN_SACK_PERBLOCK)));
688
689 for (this_sack = 0; this_sack < opts->num_sack_blocks;
690 ++this_sack) {
691 *ptr++ = htonl(sp[this_sack].start_seq);
692 *ptr++ = htonl(sp[this_sack].end_seq);
693 }
694
695 tp->rx_opt.dsack = 0;
696 }
697
698 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
699 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
700 u8 *p = (u8 *)ptr;
701 u32 len;
702
703 if (foc->exp) {
704 len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
705 *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
706 TCPOPT_FASTOPEN_MAGIC);
707 p += TCPOLEN_EXP_FASTOPEN_BASE;
708 } else {
709 len = TCPOLEN_FASTOPEN_BASE + foc->len;
710 *p++ = TCPOPT_FASTOPEN;
711 *p++ = len;
712 }
713
714 memcpy(p, foc->val, foc->len);
715 if ((len & 3) == 2) {
716 p[foc->len] = TCPOPT_NOP;
717 p[foc->len + 1] = TCPOPT_NOP;
718 }
719 ptr += (len + 3) >> 2;
720 }
721
722 smc_options_write(ptr, &options);
723
724 mptcp_options_write(ptr, tp, opts);
725}
726
727static void smc_set_option(const struct tcp_sock *tp,
728 struct tcp_out_options *opts,
729 unsigned int *remaining)
730{
731#if IS_ENABLED(CONFIG_SMC)
732 if (static_branch_unlikely(&tcp_have_smc)) {
733 if (tp->syn_smc) {
734 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
735 opts->options |= OPTION_SMC;
736 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
737 }
738 }
739 }
740#endif
741}
742
743static void smc_set_option_cond(const struct tcp_sock *tp,
744 const struct inet_request_sock *ireq,
745 struct tcp_out_options *opts,
746 unsigned int *remaining)
747{
748#if IS_ENABLED(CONFIG_SMC)
749 if (static_branch_unlikely(&tcp_have_smc)) {
750 if (tp->syn_smc && ireq->smc_ok) {
751 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
752 opts->options |= OPTION_SMC;
753 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
754 }
755 }
756 }
757#endif
758}
759
760static void mptcp_set_option_cond(const struct request_sock *req,
761 struct tcp_out_options *opts,
762 unsigned int *remaining)
763{
764 if (rsk_is_mptcp(req)) {
765 unsigned int size;
766
767 if (mptcp_synack_options(req, &size, &opts->mptcp)) {
768 if (*remaining >= size) {
769 opts->options |= OPTION_MPTCP;
770 *remaining -= size;
771 }
772 }
773 }
774}
775
776
777
778
779static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
780 struct tcp_out_options *opts,
781 struct tcp_md5sig_key **md5)
782{
783 struct tcp_sock *tp = tcp_sk(sk);
784 unsigned int remaining = MAX_TCP_OPTION_SPACE;
785 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
786
787 *md5 = NULL;
788#ifdef CONFIG_TCP_MD5SIG
789 if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
790 *md5 = tp->af_specific->md5_lookup(sk, sk);
791 if (*md5) {
792 opts->options |= OPTION_MD5;
793 remaining -= TCPOLEN_MD5SIG_ALIGNED;
794 }
795 }
796#endif
797
798
799
800
801
802
803
804
805
806
807 opts->mss = tcp_advertise_mss(sk);
808 remaining -= TCPOLEN_MSS_ALIGNED;
809
810 if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
811 opts->options |= OPTION_TS;
812 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
813 opts->tsecr = tp->rx_opt.ts_recent;
814 remaining -= TCPOLEN_TSTAMP_ALIGNED;
815 }
816 if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
817 opts->ws = tp->rx_opt.rcv_wscale;
818 opts->options |= OPTION_WSCALE;
819 remaining -= TCPOLEN_WSCALE_ALIGNED;
820 }
821 if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
822 opts->options |= OPTION_SACK_ADVERTISE;
823 if (unlikely(!(OPTION_TS & opts->options)))
824 remaining -= TCPOLEN_SACKPERM_ALIGNED;
825 }
826
827 if (fastopen && fastopen->cookie.len >= 0) {
828 u32 need = fastopen->cookie.len;
829
830 need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
831 TCPOLEN_FASTOPEN_BASE;
832 need = (need + 3) & ~3U;
833 if (remaining >= need) {
834 opts->options |= OPTION_FAST_OPEN_COOKIE;
835 opts->fastopen_cookie = &fastopen->cookie;
836 remaining -= need;
837 tp->syn_fastopen = 1;
838 tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
839 }
840 }
841
842 smc_set_option(tp, opts, &remaining);
843
844 if (sk_is_mptcp(sk)) {
845 unsigned int size;
846
847 if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
848 opts->options |= OPTION_MPTCP;
849 remaining -= size;
850 }
851 }
852
853 bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
854
855 return MAX_TCP_OPTION_SPACE - remaining;
856}
857
858
859static unsigned int tcp_synack_options(const struct sock *sk,
860 struct request_sock *req,
861 unsigned int mss, struct sk_buff *skb,
862 struct tcp_out_options *opts,
863 const struct tcp_md5sig_key *md5,
864 struct tcp_fastopen_cookie *foc,
865 enum tcp_synack_type synack_type,
866 struct sk_buff *syn_skb)
867{
868 struct inet_request_sock *ireq = inet_rsk(req);
869 unsigned int remaining = MAX_TCP_OPTION_SPACE;
870
871#ifdef CONFIG_TCP_MD5SIG
872 if (md5) {
873 opts->options |= OPTION_MD5;
874 remaining -= TCPOLEN_MD5SIG_ALIGNED;
875
876
877
878
879
880
881 if (synack_type != TCP_SYNACK_COOKIE)
882 ireq->tstamp_ok &= !ireq->sack_ok;
883 }
884#endif
885
886
887 opts->mss = mss;
888 remaining -= TCPOLEN_MSS_ALIGNED;
889
890 if (likely(ireq->wscale_ok)) {
891 opts->ws = ireq->rcv_wscale;
892 opts->options |= OPTION_WSCALE;
893 remaining -= TCPOLEN_WSCALE_ALIGNED;
894 }
895 if (likely(ireq->tstamp_ok)) {
896 opts->options |= OPTION_TS;
897 opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
898 opts->tsecr = req->ts_recent;
899 remaining -= TCPOLEN_TSTAMP_ALIGNED;
900 }
901 if (likely(ireq->sack_ok)) {
902 opts->options |= OPTION_SACK_ADVERTISE;
903 if (unlikely(!ireq->tstamp_ok))
904 remaining -= TCPOLEN_SACKPERM_ALIGNED;
905 }
906 if (foc != NULL && foc->len >= 0) {
907 u32 need = foc->len;
908
909 need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
910 TCPOLEN_FASTOPEN_BASE;
911 need = (need + 3) & ~3U;
912 if (remaining >= need) {
913 opts->options |= OPTION_FAST_OPEN_COOKIE;
914 opts->fastopen_cookie = foc;
915 remaining -= need;
916 }
917 }
918
919 mptcp_set_option_cond(req, opts, &remaining);
920
921 smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
922
923 bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
924 synack_type, opts, &remaining);
925
926 return MAX_TCP_OPTION_SPACE - remaining;
927}
928
929
930
931
932static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
933 struct tcp_out_options *opts,
934 struct tcp_md5sig_key **md5)
935{
936 struct tcp_sock *tp = tcp_sk(sk);
937 unsigned int size = 0;
938 unsigned int eff_sacks;
939
940 opts->options = 0;
941
942 *md5 = NULL;
943#ifdef CONFIG_TCP_MD5SIG
944 if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
945 *md5 = tp->af_specific->md5_lookup(sk, sk);
946 if (*md5) {
947 opts->options |= OPTION_MD5;
948 size += TCPOLEN_MD5SIG_ALIGNED;
949 }
950 }
951#endif
952
953 if (likely(tp->rx_opt.tstamp_ok)) {
954 opts->options |= OPTION_TS;
955 opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
956 opts->tsecr = tp->rx_opt.ts_recent;
957 size += TCPOLEN_TSTAMP_ALIGNED;
958 }
959
960
961
962
963
964
965
966 if (sk_is_mptcp(sk)) {
967 unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
968 unsigned int opt_size = 0;
969
970 if (mptcp_established_options(sk, skb, &opt_size, remaining,
971 &opts->mptcp)) {
972 opts->options |= OPTION_MPTCP;
973 size += opt_size;
974 }
975 }
976
977 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
978 if (unlikely(eff_sacks)) {
979 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
980 if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
981 TCPOLEN_SACK_PERBLOCK))
982 return size;
983
984 opts->num_sack_blocks =
985 min_t(unsigned int, eff_sacks,
986 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
987 TCPOLEN_SACK_PERBLOCK);
988
989 size += TCPOLEN_SACK_BASE_ALIGNED +
990 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
991 }
992
993 if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
994 BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
995 unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
996
997 bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
998
999 size = MAX_TCP_OPTION_SPACE - remaining;
1000 }
1001
1002 return size;
1003}
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020struct tsq_tasklet {
1021 struct tasklet_struct tasklet;
1022 struct list_head head;
1023};
1024static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
1025
1026static void tcp_tsq_write(struct sock *sk)
1027{
1028 if ((1 << sk->sk_state) &
1029 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
1030 TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
1031 struct tcp_sock *tp = tcp_sk(sk);
1032
1033 if (tp->lost_out > tp->retrans_out &&
1034 tp->snd_cwnd > tcp_packets_in_flight(tp)) {
1035 tcp_mstamp_refresh(tp);
1036 tcp_xmit_retransmit_queue(sk);
1037 }
1038
1039 tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
1040 0, GFP_ATOMIC);
1041 }
1042}
1043
1044static void tcp_tsq_handler(struct sock *sk)
1045{
1046 bh_lock_sock(sk);
1047 if (!sock_owned_by_user(sk))
1048 tcp_tsq_write(sk);
1049 else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
1050 sock_hold(sk);
1051 bh_unlock_sock(sk);
1052}
1053
1054
1055
1056
1057
1058
1059static void tcp_tasklet_func(unsigned long data)
1060{
1061 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
1062 LIST_HEAD(list);
1063 unsigned long flags;
1064 struct list_head *q, *n;
1065 struct tcp_sock *tp;
1066 struct sock *sk;
1067
1068 local_irq_save(flags);
1069 list_splice_init(&tsq->head, &list);
1070 local_irq_restore(flags);
1071
1072 list_for_each_safe(q, n, &list) {
1073 tp = list_entry(q, struct tcp_sock, tsq_node);
1074 list_del(&tp->tsq_node);
1075
1076 sk = (struct sock *)tp;
1077 smp_mb__before_atomic();
1078 clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
1079
1080 tcp_tsq_handler(sk);
1081 sk_free(sk);
1082 }
1083}
1084
1085#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
1086 TCPF_WRITE_TIMER_DEFERRED | \
1087 TCPF_DELACK_TIMER_DEFERRED | \
1088 TCPF_MTU_REDUCED_DEFERRED)
1089
1090
1091
1092
1093
1094
1095
1096void tcp_release_cb(struct sock *sk)
1097{
1098 unsigned long flags, nflags;
1099
1100
1101 do {
1102 flags = sk->sk_tsq_flags;
1103 if (!(flags & TCP_DEFERRED_ALL))
1104 return;
1105 nflags = flags & ~TCP_DEFERRED_ALL;
1106 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
1107
1108 if (flags & TCPF_TSQ_DEFERRED) {
1109 tcp_tsq_write(sk);
1110 __sock_put(sk);
1111 }
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121 sock_release_ownership(sk);
1122
1123 if (flags & TCPF_WRITE_TIMER_DEFERRED) {
1124 tcp_write_timer_handler(sk);
1125 __sock_put(sk);
1126 }
1127 if (flags & TCPF_DELACK_TIMER_DEFERRED) {
1128 tcp_delack_timer_handler(sk);
1129 __sock_put(sk);
1130 }
1131 if (flags & TCPF_MTU_REDUCED_DEFERRED) {
1132 inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
1133 __sock_put(sk);
1134 }
1135}
1136EXPORT_SYMBOL(tcp_release_cb);
1137
1138void __init tcp_tasklet_init(void)
1139{
1140 int i;
1141
1142 for_each_possible_cpu(i) {
1143 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
1144
1145 INIT_LIST_HEAD(&tsq->head);
1146 tasklet_init(&tsq->tasklet,
1147 tcp_tasklet_func,
1148 (unsigned long)tsq);
1149 }
1150}
1151
1152
1153
1154
1155
1156
1157void tcp_wfree(struct sk_buff *skb)
1158{
1159 struct sock *sk = skb->sk;
1160 struct tcp_sock *tp = tcp_sk(sk);
1161 unsigned long flags, nval, oval;
1162
1163
1164
1165
1166 WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
1167
1168
1169
1170
1171
1172
1173
1174
1175 if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
1176 goto out;
1177
1178 for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
1179 struct tsq_tasklet *tsq;
1180 bool empty;
1181
1182 if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
1183 goto out;
1184
1185 nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
1186 nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
1187 if (nval != oval)
1188 continue;
1189
1190
1191 local_irq_save(flags);
1192 tsq = this_cpu_ptr(&tsq_tasklet);
1193 empty = list_empty(&tsq->head);
1194 list_add(&tp->tsq_node, &tsq->head);
1195 if (empty)
1196 tasklet_schedule(&tsq->tasklet);
1197 local_irq_restore(flags);
1198 return;
1199 }
1200out:
1201 sk_free(sk);
1202}
1203
1204
1205
1206
1207enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
1208{
1209 struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
1210 struct sock *sk = (struct sock *)tp;
1211
1212 tcp_tsq_handler(sk);
1213 sock_put(sk);
1214
1215 return HRTIMER_NORESTART;
1216}
1217
1218static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
1219 u64 prior_wstamp)
1220{
1221 struct tcp_sock *tp = tcp_sk(sk);
1222
1223 skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
1224 if (sk->sk_pacing_status != SK_PACING_NONE) {
1225 u32 rate = sk->sk_pacing_rate;
1226
1227
1228
1229
1230
1231 if (rate != ~0U && rate && tp->data_segs_out >= 10) {
1232 u64 len_ns = div_u64((u64)skb->len * NSEC_PER_SEC, rate);
1233 u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
1234
1235
1236 len_ns -= min_t(u64, len_ns / 2, credit);
1237 tp->tcp_wstamp_ns += len_ns;
1238 }
1239 }
1240 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
1241}
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1255 int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
1256{
1257 const struct inet_connection_sock *icsk = inet_csk(sk);
1258 struct inet_sock *inet;
1259 struct tcp_sock *tp;
1260 struct tcp_skb_cb *tcb;
1261 struct tcp_out_options opts;
1262 unsigned int tcp_options_size, tcp_header_size;
1263 struct sk_buff *oskb = NULL;
1264 struct tcp_md5sig_key *md5;
1265 struct tcphdr *th;
1266 u64 prior_wstamp;
1267 int err;
1268
1269 BUG_ON(!skb || !tcp_skb_pcount(skb));
1270 tp = tcp_sk(sk);
1271
1272 if (clone_it) {
1273 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
1274 - tp->snd_una;
1275 oskb = skb;
1276
1277 tcp_skb_tsorted_save(oskb) {
1278 if (unlikely(skb_cloned(oskb)))
1279 skb = pskb_copy(oskb, gfp_mask);
1280 else
1281 skb = skb_clone(oskb, gfp_mask);
1282 } tcp_skb_tsorted_restore(oskb);
1283
1284 if (unlikely(!skb))
1285 return -ENOBUFS;
1286
1287
1288
1289 skb->dev = NULL;
1290 }
1291
1292 prior_wstamp = tp->tcp_wstamp_ns;
1293 tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1294
1295 skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
1296
1297 inet = inet_sk(sk);
1298 tcb = TCP_SKB_CB(skb);
1299 memset(&opts, 0, sizeof(opts));
1300
1301 if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
1302 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1303 else
1304 tcp_options_size = tcp_established_options(sk, skb, &opts,
1305 &md5);
1306 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
1307
1308
1309
1310
1311
1312
1313
1314
1315 skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
1316
1317
1318
1319
1320
1321
1322 skb->pfmemalloc = 0;
1323
1324 skb_push(skb, tcp_header_size);
1325 skb_reset_transport_header(skb);
1326
1327 skb_orphan(skb);
1328 skb->sk = sk;
1329 skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
1330 skb_set_hash_from_sk(skb, sk);
1331 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1332
1333 skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
1334
1335
1336 th = (struct tcphdr *)skb->data;
1337 th->source = inet->inet_sport;
1338 th->dest = inet->inet_dport;
1339 th->seq = htonl(tcb->seq);
1340 th->ack_seq = htonl(rcv_nxt);
1341 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
1342 tcb->tcp_flags);
1343
1344 th->check = 0;
1345 th->urg_ptr = 0;
1346
1347
1348 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
1349 if (before(tp->snd_up, tcb->seq + 0x10000)) {
1350 th->urg_ptr = htons(tp->snd_up - tcb->seq);
1351 th->urg = 1;
1352 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
1353 th->urg_ptr = htons(0xFFFF);
1354 th->urg = 1;
1355 }
1356 }
1357
1358 skb_shinfo(skb)->gso_type = sk->sk_gso_type;
1359 if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
1360 th->window = htons(tcp_select_window(sk));
1361 tcp_ecn_send(sk, skb, th, tcp_header_size);
1362 } else {
1363
1364
1365
1366 th->window = htons(min(tp->rcv_wnd, 65535U));
1367 }
1368
1369 tcp_options_write((__be32 *)(th + 1), tp, &opts);
1370
1371#ifdef CONFIG_TCP_MD5SIG
1372
1373 if (md5) {
1374 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1375 tp->af_specific->calc_md5_hash(opts.hash_location,
1376 md5, sk, skb);
1377 }
1378#endif
1379
1380
1381 bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
1382
1383 icsk->icsk_af_ops->send_check(sk, skb);
1384
1385 if (likely(tcb->tcp_flags & TCPHDR_ACK))
1386 tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
1387
1388 if (skb->len != tcp_header_size) {
1389 tcp_event_data_sent(tp, sk);
1390 tp->data_segs_out += tcp_skb_pcount(skb);
1391 tp->bytes_sent += skb->len - tcp_header_size;
1392 }
1393
1394 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
1395 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
1396 tcp_skb_pcount(skb));
1397
1398 tp->segs_out += tcp_skb_pcount(skb);
1399
1400 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
1401 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
1402
1403
1404
1405
1406 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
1407 sizeof(struct inet6_skb_parm)));
1408
1409 err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
1410
1411 if (unlikely(err > 0)) {
1412 tcp_enter_cwr(sk);
1413 err = net_xmit_eval(err);
1414 }
1415 if (!err && oskb) {
1416 tcp_update_skb_after_send(sk, oskb, prior_wstamp);
1417 tcp_rate_skb_sent(sk, oskb);
1418 }
1419 return err;
1420}
1421
1422static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1423 gfp_t gfp_mask)
1424{
1425 return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
1426 tcp_sk(sk)->rcv_nxt);
1427}
1428
1429
1430
1431
1432
1433
1434static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
1435{
1436 struct tcp_sock *tp = tcp_sk(sk);
1437
1438
1439 WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
1440 __skb_header_release(skb);
1441 tcp_add_write_queue_tail(sk, skb);
1442 sk_wmem_queued_add(sk, skb->truesize);
1443 sk_mem_charge(sk, skb->truesize);
1444}
1445
1446
1447static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1448{
1449 if (skb->len <= mss_now) {
1450
1451
1452
1453 tcp_skb_pcount_set(skb, 1);
1454 TCP_SKB_CB(skb)->tcp_gso_size = 0;
1455 } else {
1456 tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1457 TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
1458 }
1459}
1460
1461
1462
1463
1464static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1465{
1466 struct tcp_sock *tp = tcp_sk(sk);
1467
1468 tp->packets_out -= decr;
1469
1470 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1471 tp->sacked_out -= decr;
1472 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1473 tp->retrans_out -= decr;
1474 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1475 tp->lost_out -= decr;
1476
1477
1478 if (tcp_is_reno(tp) && decr > 0)
1479 tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1480
1481 if (tp->lost_skb_hint &&
1482 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1483 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1484 tp->lost_cnt_hint -= decr;
1485
1486 tcp_verify_left_out(tp);
1487}
1488
1489static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
1490{
1491 return TCP_SKB_CB(skb)->txstamp_ack ||
1492 (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
1493}
1494
1495static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
1496{
1497 struct skb_shared_info *shinfo = skb_shinfo(skb);
1498
1499 if (unlikely(tcp_has_tx_tstamp(skb)) &&
1500 !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
1501 struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
1502 u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
1503
1504 shinfo->tx_flags &= ~tsflags;
1505 shinfo2->tx_flags |= tsflags;
1506 swap(shinfo->tskey, shinfo2->tskey);
1507 TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
1508 TCP_SKB_CB(skb)->txstamp_ack = 0;
1509 }
1510}
1511
1512static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
1513{
1514 TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
1515 TCP_SKB_CB(skb)->eor = 0;
1516}
1517
1518
1519static void tcp_insert_write_queue_after(struct sk_buff *skb,
1520 struct sk_buff *buff,
1521 struct sock *sk,
1522 enum tcp_queue tcp_queue)
1523{
1524 if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
1525 __skb_queue_after(&sk->sk_write_queue, skb, buff);
1526 else
1527 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
1528}
1529
1530
1531
1532
1533
1534
1535int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1536 struct sk_buff *skb, u32 len,
1537 unsigned int mss_now, gfp_t gfp)
1538{
1539 struct tcp_sock *tp = tcp_sk(sk);
1540 struct sk_buff *buff;
1541 int nsize, old_factor;
1542 long limit;
1543 int nlen;
1544 u8 flags;
1545
1546 if (WARN_ON(len > skb->len))
1547 return -EINVAL;
1548
1549 nsize = skb_headlen(skb) - len;
1550 if (nsize < 0)
1551 nsize = 0;
1552
1553
1554
1555
1556
1557
1558 limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
1559 if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
1560 tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
1561 skb != tcp_rtx_queue_head(sk) &&
1562 skb != tcp_rtx_queue_tail(sk))) {
1563 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
1564 return -ENOMEM;
1565 }
1566
1567 if (skb_unclone(skb, gfp))
1568 return -ENOMEM;
1569
1570
1571 buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
1572 if (!buff)
1573 return -ENOMEM;
1574 skb_copy_decrypted(buff, skb);
1575 mptcp_skb_ext_copy(buff, skb);
1576
1577 sk_wmem_queued_add(sk, buff->truesize);
1578 sk_mem_charge(sk, buff->truesize);
1579 nlen = skb->len - len - nsize;
1580 buff->truesize += nlen;
1581 skb->truesize -= nlen;
1582
1583
1584 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1585 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1586 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1587
1588
1589 flags = TCP_SKB_CB(skb)->tcp_flags;
1590 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1591 TCP_SKB_CB(buff)->tcp_flags = flags;
1592 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1593 tcp_skb_fragment_eor(skb, buff);
1594
1595 skb_split(skb, buff, len);
1596
1597 buff->ip_summed = CHECKSUM_PARTIAL;
1598
1599 buff->tstamp = skb->tstamp;
1600 tcp_fragment_tstamp(skb, buff);
1601
1602 old_factor = tcp_skb_pcount(skb);
1603
1604
1605 tcp_set_skb_tso_segs(skb, mss_now);
1606 tcp_set_skb_tso_segs(buff, mss_now);
1607
1608
1609 TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
1610
1611
1612
1613
1614 if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1615 int diff = old_factor - tcp_skb_pcount(skb) -
1616 tcp_skb_pcount(buff);
1617
1618 if (diff)
1619 tcp_adjust_pcount(sk, skb, diff);
1620 }
1621
1622
1623 __skb_header_release(buff);
1624 tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1625 if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
1626 list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
1627
1628 return 0;
1629}
1630
1631
1632
1633
1634static int __pskb_trim_head(struct sk_buff *skb, int len)
1635{
1636 struct skb_shared_info *shinfo;
1637 int i, k, eat;
1638
1639 eat = min_t(int, len, skb_headlen(skb));
1640 if (eat) {
1641 __skb_pull(skb, eat);
1642 len -= eat;
1643 if (!len)
1644 return 0;
1645 }
1646 eat = len;
1647 k = 0;
1648 shinfo = skb_shinfo(skb);
1649 for (i = 0; i < shinfo->nr_frags; i++) {
1650 int size = skb_frag_size(&shinfo->frags[i]);
1651
1652 if (size <= eat) {
1653 skb_frag_unref(skb, i);
1654 eat -= size;
1655 } else {
1656 shinfo->frags[k] = shinfo->frags[i];
1657 if (eat) {
1658 shinfo->frags[k].page_offset += eat;
1659 skb_frag_size_sub(&shinfo->frags[k], eat);
1660 eat = 0;
1661 }
1662 k++;
1663 }
1664 }
1665 shinfo->nr_frags = k;
1666
1667 skb->data_len -= len;
1668 skb->len = skb->data_len;
1669 return len;
1670}
1671
1672
1673int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1674{
1675 u32 delta_truesize;
1676
1677 if (skb_unclone(skb, GFP_ATOMIC))
1678 return -ENOMEM;
1679
1680 delta_truesize = __pskb_trim_head(skb, len);
1681
1682 TCP_SKB_CB(skb)->seq += len;
1683 skb->ip_summed = CHECKSUM_PARTIAL;
1684
1685 if (delta_truesize) {
1686 skb->truesize -= delta_truesize;
1687 sk_wmem_queued_add(sk, -delta_truesize);
1688 sk_mem_uncharge(sk, delta_truesize);
1689 }
1690
1691
1692 if (tcp_skb_pcount(skb) > 1)
1693 tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
1694
1695 return 0;
1696}
1697
1698
1699static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
1700{
1701 const struct tcp_sock *tp = tcp_sk(sk);
1702 const struct inet_connection_sock *icsk = inet_csk(sk);
1703 int mss_now;
1704
1705
1706
1707
1708 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1709
1710
1711 if (icsk->icsk_af_ops->net_frag_header_len) {
1712 const struct dst_entry *dst = __sk_dst_get(sk);
1713
1714 if (dst && dst_allfrag(dst))
1715 mss_now -= icsk->icsk_af_ops->net_frag_header_len;
1716 }
1717
1718
1719 if (mss_now > tp->rx_opt.mss_clamp)
1720 mss_now = tp->rx_opt.mss_clamp;
1721
1722
1723 mss_now -= icsk->icsk_ext_hdr_len;
1724
1725
1726 mss_now = max(mss_now, sock_net(sk)->ipv4_sysctl_tcp_min_snd_mss);
1727 return mss_now;
1728}
1729
1730
1731int tcp_mtu_to_mss(struct sock *sk, int pmtu)
1732{
1733
1734 return __tcp_mtu_to_mss(sk, pmtu) -
1735 (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
1736}
1737
1738
1739int tcp_mss_to_mtu(struct sock *sk, int mss)
1740{
1741 const struct tcp_sock *tp = tcp_sk(sk);
1742 const struct inet_connection_sock *icsk = inet_csk(sk);
1743 int mtu;
1744
1745 mtu = mss +
1746 tp->tcp_header_len +
1747 icsk->icsk_ext_hdr_len +
1748 icsk->icsk_af_ops->net_header_len;
1749
1750
1751 if (icsk->icsk_af_ops->net_frag_header_len) {
1752 const struct dst_entry *dst = __sk_dst_get(sk);
1753
1754 if (dst && dst_allfrag(dst))
1755 mtu += icsk->icsk_af_ops->net_frag_header_len;
1756 }
1757 return mtu;
1758}
1759EXPORT_SYMBOL(tcp_mss_to_mtu);
1760
1761
1762void tcp_mtup_init(struct sock *sk)
1763{
1764 struct tcp_sock *tp = tcp_sk(sk);
1765 struct inet_connection_sock *icsk = inet_csk(sk);
1766 struct net *net = sock_net(sk);
1767
1768 icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
1769 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1770 icsk->icsk_af_ops->net_header_len;
1771 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1772 icsk->icsk_mtup.probe_size = 0;
1773 if (icsk->icsk_mtup.enabled)
1774 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
1775}
1776EXPORT_SYMBOL(tcp_mtup_init);
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1801{
1802 struct tcp_sock *tp = tcp_sk(sk);
1803 struct inet_connection_sock *icsk = inet_csk(sk);
1804 int mss_now;
1805
1806 if (icsk->icsk_mtup.search_high > pmtu)
1807 icsk->icsk_mtup.search_high = pmtu;
1808
1809 mss_now = tcp_mtu_to_mss(sk, pmtu);
1810 mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1811
1812
1813 icsk->icsk_pmtu_cookie = pmtu;
1814 if (icsk->icsk_mtup.enabled)
1815 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1816 tp->mss_cache = mss_now;
1817
1818 return mss_now;
1819}
1820EXPORT_SYMBOL(tcp_sync_mss);
1821
1822
1823
1824
1825unsigned int tcp_current_mss(struct sock *sk)
1826{
1827 const struct tcp_sock *tp = tcp_sk(sk);
1828 const struct dst_entry *dst = __sk_dst_get(sk);
1829 u32 mss_now;
1830 unsigned int header_len;
1831 struct tcp_out_options opts;
1832 struct tcp_md5sig_key *md5;
1833
1834 mss_now = tp->mss_cache;
1835
1836 if (dst) {
1837 u32 mtu = dst_mtu(dst);
1838 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1839 mss_now = tcp_sync_mss(sk, mtu);
1840 }
1841
1842 header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1843 sizeof(struct tcphdr);
1844
1845
1846
1847
1848 if (header_len != tp->tcp_header_len) {
1849 int delta = (int) header_len - tp->tcp_header_len;
1850 mss_now -= delta;
1851 }
1852
1853 return mss_now;
1854}
1855
1856
1857
1858
1859
1860static void tcp_cwnd_application_limited(struct sock *sk)
1861{
1862 struct tcp_sock *tp = tcp_sk(sk);
1863
1864 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1865 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1866
1867 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
1868 u32 win_used = max(tp->snd_cwnd_used, init_win);
1869 if (win_used < tp->snd_cwnd) {
1870 tp->snd_ssthresh = tcp_current_ssthresh(sk);
1871 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
1872 }
1873 tp->snd_cwnd_used = 0;
1874 }
1875 tp->snd_cwnd_stamp = tcp_jiffies32;
1876}
1877
1878static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1879{
1880 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1881 struct tcp_sock *tp = tcp_sk(sk);
1882
1883
1884
1885
1886 if (!before(tp->snd_una, tp->max_packets_seq) ||
1887 tp->packets_out > tp->max_packets_out ||
1888 is_cwnd_limited) {
1889 tp->max_packets_out = tp->packets_out;
1890 tp->max_packets_seq = tp->snd_nxt;
1891 tp->is_cwnd_limited = is_cwnd_limited;
1892 }
1893
1894 if (tcp_is_cwnd_limited(sk)) {
1895
1896 tp->snd_cwnd_used = 0;
1897 tp->snd_cwnd_stamp = tcp_jiffies32;
1898 } else {
1899
1900 if (tp->packets_out > tp->snd_cwnd_used)
1901 tp->snd_cwnd_used = tp->packets_out;
1902
1903 if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
1904 (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
1905 !ca_ops->cong_control)
1906 tcp_cwnd_application_limited(sk);
1907
1908
1909
1910
1911
1912
1913
1914
1915 if (tcp_write_queue_empty(sk) && sk->sk_socket &&
1916 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1917 (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1918 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
1919 }
1920}
1921
1922
1923static bool tcp_minshall_check(const struct tcp_sock *tp)
1924{
1925 return after(tp->snd_sml, tp->snd_una) &&
1926 !after(tp->snd_sml, tp->snd_nxt);
1927}
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
1938 const struct sk_buff *skb)
1939{
1940 if (skb->len < tcp_skb_pcount(skb) * mss_now)
1941 tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1942}
1943
1944
1945
1946
1947
1948
1949
1950
1951static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1952 int nonagle)
1953{
1954 return partial &&
1955 ((nonagle & TCP_NAGLE_CORK) ||
1956 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1957}
1958
1959
1960
1961
1962static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
1963 int min_tso_segs)
1964{
1965 u32 bytes, segs;
1966
1967 bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
1968 sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
1969
1970
1971
1972
1973
1974
1975 segs = max_t(u32, bytes / mss_now, min_tso_segs);
1976
1977 return segs;
1978}
1979
1980
1981
1982
1983static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
1984{
1985 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1986 u32 min_tso, tso_segs;
1987
1988 min_tso = ca_ops->min_tso_segs ?
1989 ca_ops->min_tso_segs(sk) :
1990 sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
1991
1992 tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
1993 return min_t(u32, tso_segs, sk->sk_gso_max_segs);
1994}
1995
1996
1997static unsigned int tcp_mss_split_point(const struct sock *sk,
1998 const struct sk_buff *skb,
1999 unsigned int mss_now,
2000 unsigned int max_segs,
2001 int nonagle)
2002{
2003 const struct tcp_sock *tp = tcp_sk(sk);
2004 u32 partial, needed, window, max_len;
2005
2006 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2007 max_len = mss_now * max_segs;
2008
2009 if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
2010 return max_len;
2011
2012 needed = min(skb->len, window);
2013
2014 if (max_len <= needed)
2015 return max_len;
2016
2017 partial = needed % mss_now;
2018
2019
2020
2021
2022 if (tcp_nagle_check(partial != 0, tp, nonagle))
2023 return needed - partial;
2024
2025 return needed;
2026}
2027
2028
2029
2030
2031static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
2032 const struct sk_buff *skb)
2033{
2034 u32 in_flight, cwnd, halfcwnd;
2035
2036
2037 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
2038 tcp_skb_pcount(skb) == 1)
2039 return 1;
2040
2041 in_flight = tcp_packets_in_flight(tp);
2042 cwnd = tp->snd_cwnd;
2043 if (in_flight >= cwnd)
2044 return 0;
2045
2046
2047
2048
2049 halfcwnd = max(cwnd >> 1, 1U);
2050 return min(halfcwnd, cwnd - in_flight);
2051}
2052
2053
2054
2055
2056
2057static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
2058{
2059 int tso_segs = tcp_skb_pcount(skb);
2060
2061 if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
2062 tcp_set_skb_tso_segs(skb, mss_now);
2063 tso_segs = tcp_skb_pcount(skb);
2064 }
2065 return tso_segs;
2066}
2067
2068
2069
2070
2071
2072static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2073 unsigned int cur_mss, int nonagle)
2074{
2075
2076
2077
2078
2079
2080
2081 if (nonagle & TCP_NAGLE_PUSH)
2082 return true;
2083
2084
2085 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
2086 return true;
2087
2088 if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
2089 return true;
2090
2091 return false;
2092}
2093
2094
2095static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
2096 const struct sk_buff *skb,
2097 unsigned int cur_mss)
2098{
2099 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
2100
2101 if (skb->len > cur_mss)
2102 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
2103
2104 return !after(end_seq, tcp_wnd_end(tp));
2105}
2106
2107
2108
2109
2110
2111
2112
2113
2114static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
2115 struct sk_buff *skb, unsigned int len,
2116 unsigned int mss_now, gfp_t gfp)
2117{
2118 struct sk_buff *buff;
2119 int nlen = skb->len - len;
2120 u8 flags;
2121
2122
2123 if (skb->len != skb->data_len)
2124 return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
2125
2126 buff = sk_stream_alloc_skb(sk, 0, gfp, true);
2127 if (unlikely(!buff))
2128 return -ENOMEM;
2129 skb_copy_decrypted(buff, skb);
2130 mptcp_skb_ext_copy(buff, skb);
2131
2132 sk_wmem_queued_add(sk, buff->truesize);
2133 sk_mem_charge(sk, buff->truesize);
2134 buff->truesize += nlen;
2135 skb->truesize -= nlen;
2136
2137
2138 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
2139 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
2140 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
2141
2142
2143 flags = TCP_SKB_CB(skb)->tcp_flags;
2144 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
2145 TCP_SKB_CB(buff)->tcp_flags = flags;
2146
2147
2148 TCP_SKB_CB(buff)->sacked = 0;
2149
2150 tcp_skb_fragment_eor(skb, buff);
2151
2152 buff->ip_summed = CHECKSUM_PARTIAL;
2153 skb_split(skb, buff, len);
2154 tcp_fragment_tstamp(skb, buff);
2155
2156
2157 tcp_set_skb_tso_segs(skb, mss_now);
2158 tcp_set_skb_tso_segs(buff, mss_now);
2159
2160
2161 __skb_header_release(buff);
2162 tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
2163
2164 return 0;
2165}
2166
2167
2168
2169
2170
2171
2172static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
2173 bool *is_cwnd_limited,
2174 bool *is_rwnd_limited,
2175 u32 max_segs)
2176{
2177 const struct inet_connection_sock *icsk = inet_csk(sk);
2178 u32 send_win, cong_win, limit, in_flight;
2179 struct tcp_sock *tp = tcp_sk(sk);
2180 struct sk_buff *head;
2181 int win_divisor;
2182 s64 delta;
2183
2184 if (icsk->icsk_ca_state >= TCP_CA_Recovery)
2185 goto send_now;
2186
2187
2188
2189
2190 if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
2191 goto send_now;
2192
2193 in_flight = tcp_packets_in_flight(tp);
2194
2195 BUG_ON(tcp_skb_pcount(skb) <= 1);
2196 BUG_ON(tp->snd_cwnd <= in_flight);
2197
2198 send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2199
2200
2201 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
2202
2203 limit = min(send_win, cong_win);
2204
2205
2206 if (limit >= max_segs * tp->mss_cache)
2207 goto send_now;
2208
2209
2210 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
2211 goto send_now;
2212
2213 win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
2214 if (win_divisor) {
2215 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
2216
2217
2218
2219
2220 chunk /= win_divisor;
2221 if (limit >= chunk)
2222 goto send_now;
2223 } else {
2224
2225
2226
2227
2228
2229 if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
2230 goto send_now;
2231 }
2232
2233
2234 head = tcp_rtx_queue_head(sk);
2235 if (!head)
2236 goto send_now;
2237 delta = tp->tcp_clock_cache - head->tstamp;
2238
2239 if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
2240 goto send_now;
2241
2242
2243
2244
2245
2246
2247
2248 if (cong_win < send_win) {
2249 if (cong_win <= skb->len) {
2250 *is_cwnd_limited = true;
2251 return true;
2252 }
2253 } else {
2254 if (send_win <= skb->len) {
2255 *is_rwnd_limited = true;
2256 return true;
2257 }
2258 }
2259
2260
2261 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2262 goto send_now;
2263
2264 return true;
2265
2266send_now:
2267 return false;
2268}
2269
2270static inline void tcp_mtu_check_reprobe(struct sock *sk)
2271{
2272 struct inet_connection_sock *icsk = inet_csk(sk);
2273 struct tcp_sock *tp = tcp_sk(sk);
2274 struct net *net = sock_net(sk);
2275 u32 interval;
2276 s32 delta;
2277
2278 interval = net->ipv4.sysctl_tcp_probe_interval;
2279 delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
2280 if (unlikely(delta >= interval * HZ)) {
2281 int mss = tcp_current_mss(sk);
2282
2283
2284 icsk->icsk_mtup.probe_size = 0;
2285 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
2286 sizeof(struct tcphdr) +
2287 icsk->icsk_af_ops->net_header_len;
2288 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
2289
2290
2291 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
2292 }
2293}
2294
2295static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
2296{
2297 struct sk_buff *skb, *next;
2298
2299 skb = tcp_send_head(sk);
2300 tcp_for_write_queue_from_safe(skb, next, sk) {
2301 if (len <= skb->len)
2302 break;
2303
2304 if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
2305 return false;
2306
2307 len -= skb->len;
2308 }
2309
2310 return true;
2311}
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322static int tcp_mtu_probe(struct sock *sk)
2323{
2324 struct inet_connection_sock *icsk = inet_csk(sk);
2325 struct tcp_sock *tp = tcp_sk(sk);
2326 struct sk_buff *skb, *nskb, *next;
2327 struct net *net = sock_net(sk);
2328 int probe_size;
2329 int size_needed;
2330 int copy, len;
2331 int mss_now;
2332 int interval;
2333
2334
2335
2336
2337
2338
2339 if (likely(!icsk->icsk_mtup.enabled ||
2340 icsk->icsk_mtup.probe_size ||
2341 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
2342 tp->snd_cwnd < 11 ||
2343 tp->rx_opt.num_sacks || tp->rx_opt.dsack))
2344 return -1;
2345
2346
2347
2348
2349
2350 mss_now = tcp_current_mss(sk);
2351 probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
2352 icsk->icsk_mtup.search_low) >> 1);
2353 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
2354 interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
2355
2356
2357
2358
2359 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
2360 interval < net->ipv4.sysctl_tcp_probe_threshold) {
2361
2362
2363
2364 tcp_mtu_check_reprobe(sk);
2365 return -1;
2366 }
2367
2368
2369 if (tp->write_seq - tp->snd_nxt < size_needed)
2370 return -1;
2371
2372 if (tp->snd_wnd < size_needed)
2373 return -1;
2374 if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
2375 return 0;
2376
2377
2378 if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
2379 if (!tcp_packets_in_flight(tp))
2380 return -1;
2381 else
2382 return 0;
2383 }
2384
2385 if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
2386 return -1;
2387
2388
2389 nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
2390 if (!nskb)
2391 return -1;
2392 sk_wmem_queued_add(sk, nskb->truesize);
2393 sk_mem_charge(sk, nskb->truesize);
2394
2395 skb = tcp_send_head(sk);
2396 skb_copy_decrypted(nskb, skb);
2397 mptcp_skb_ext_copy(nskb, skb);
2398
2399 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
2400 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
2401 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
2402 TCP_SKB_CB(nskb)->sacked = 0;
2403 nskb->csum = 0;
2404 nskb->ip_summed = CHECKSUM_PARTIAL;
2405
2406 tcp_insert_write_queue_before(nskb, skb, sk);
2407 tcp_highest_sack_replace(sk, skb, nskb);
2408
2409 len = 0;
2410 tcp_for_write_queue_from_safe(skb, next, sk) {
2411 copy = min_t(int, skb->len, probe_size - len);
2412 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
2413
2414 if (skb->len <= copy) {
2415
2416
2417 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2418
2419
2420
2421 TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
2422 tcp_skb_collapse_tstamp(nskb, skb);
2423 tcp_unlink_write_queue(skb, sk);
2424 sk_wmem_free_skb(sk, skb);
2425 } else {
2426 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
2427 ~(TCPHDR_FIN|TCPHDR_PSH);
2428 if (!skb_shinfo(skb)->nr_frags) {
2429 skb_pull(skb, copy);
2430 } else {
2431 __pskb_trim_head(skb, copy);
2432 tcp_set_skb_tso_segs(skb, mss_now);
2433 }
2434 TCP_SKB_CB(skb)->seq += copy;
2435 }
2436
2437 len += copy;
2438
2439 if (len >= probe_size)
2440 break;
2441 }
2442 tcp_init_tso_segs(nskb, nskb->len);
2443
2444
2445
2446
2447 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
2448
2449
2450 tp->snd_cwnd--;
2451 tcp_event_new_data_sent(sk, nskb);
2452
2453 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
2454 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
2455 tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
2456
2457 return 1;
2458 }
2459
2460 return -1;
2461}
2462
2463static bool tcp_pacing_check(struct sock *sk)
2464{
2465 struct tcp_sock *tp = tcp_sk(sk);
2466
2467 if (!tcp_needs_internal_pacing(sk))
2468 return false;
2469
2470 if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2471 return false;
2472
2473 if (!hrtimer_is_queued(&tp->pacing_timer)) {
2474 hrtimer_start(&tp->pacing_timer,
2475 ns_to_ktime(tp->tcp_wstamp_ns),
2476 HRTIMER_MODE_ABS_PINNED_SOFT);
2477 sock_hold(sk);
2478 }
2479 return true;
2480}
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2494 unsigned int factor)
2495{
2496 unsigned int limit;
2497
2498 limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
2499 limit = min_t(u32, limit,
2500 sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2501 limit <<= factor;
2502
2503 if (refcount_read(&sk->sk_wmem_alloc) > limit) {
2504
2505
2506
2507
2508
2509 if (tcp_rtx_queue_empty(sk))
2510 return false;
2511
2512 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
2513
2514
2515
2516
2517 smp_mb__after_atomic();
2518 if (refcount_read(&sk->sk_wmem_alloc) > limit)
2519 return true;
2520 }
2521 return false;
2522}
2523
2524static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
2525{
2526 const u32 now = tcp_jiffies32;
2527 enum tcp_chrono old = tp->chrono_type;
2528
2529 if (old > TCP_CHRONO_UNSPEC)
2530 tp->chrono_stat[old - 1] += now - tp->chrono_start;
2531 tp->chrono_start = now;
2532 tp->chrono_type = new;
2533}
2534
2535void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
2536{
2537 struct tcp_sock *tp = tcp_sk(sk);
2538
2539
2540
2541
2542
2543
2544 if (type > tp->chrono_type)
2545 tcp_chrono_set(tp, type);
2546}
2547
2548void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
2549{
2550 struct tcp_sock *tp = tcp_sk(sk);
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560 if (tcp_rtx_and_write_queues_empty(sk))
2561 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
2562 else if (type == tp->chrono_type)
2563 tcp_chrono_set(tp, TCP_CHRONO_BUSY);
2564}
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2581 int push_one, gfp_t gfp)
2582{
2583 struct tcp_sock *tp = tcp_sk(sk);
2584 struct sk_buff *skb;
2585 unsigned int tso_segs, sent_pkts;
2586 int cwnd_quota;
2587 int result;
2588 bool is_cwnd_limited = false, is_rwnd_limited = false;
2589 u32 max_segs;
2590
2591 sent_pkts = 0;
2592
2593 tcp_mstamp_refresh(tp);
2594 if (!push_one) {
2595
2596 result = tcp_mtu_probe(sk);
2597 if (!result) {
2598 return false;
2599 } else if (result > 0) {
2600 sent_pkts = 1;
2601 }
2602 }
2603
2604 max_segs = tcp_tso_segs(sk, mss_now);
2605 while ((skb = tcp_send_head(sk))) {
2606 unsigned int limit;
2607
2608 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2609
2610 skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2611 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
2612 tcp_init_tso_segs(skb, mss_now);
2613 goto repair;
2614 }
2615
2616 if (tcp_pacing_check(sk))
2617 break;
2618
2619 tso_segs = tcp_init_tso_segs(skb, mss_now);
2620 BUG_ON(!tso_segs);
2621
2622 cwnd_quota = tcp_cwnd_test(tp, skb);
2623 if (!cwnd_quota) {
2624 if (push_one == 2)
2625
2626 cwnd_quota = 1;
2627 else
2628 break;
2629 }
2630
2631 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
2632 is_rwnd_limited = true;
2633 break;
2634 }
2635
2636 if (tso_segs == 1) {
2637 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
2638 (tcp_skb_is_last(sk, skb) ?
2639 nonagle : TCP_NAGLE_PUSH))))
2640 break;
2641 } else {
2642 if (!push_one &&
2643 tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
2644 &is_rwnd_limited, max_segs))
2645 break;
2646 }
2647
2648 limit = mss_now;
2649 if (tso_segs > 1 && !tcp_urg_mode(tp))
2650 limit = tcp_mss_split_point(sk, skb, mss_now,
2651 min_t(unsigned int,
2652 cwnd_quota,
2653 max_segs),
2654 nonagle);
2655
2656 if (skb->len > limit &&
2657 unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
2658 skb, limit, mss_now, gfp)))
2659 break;
2660
2661 if (tcp_small_queue_check(sk, skb, 0))
2662 break;
2663
2664
2665
2666
2667
2668
2669 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
2670 break;
2671
2672 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2673 break;
2674
2675repair:
2676
2677
2678
2679 tcp_event_new_data_sent(sk, skb);
2680
2681 tcp_minshall_update(tp, mss_now, skb);
2682 sent_pkts += tcp_skb_pcount(skb);
2683
2684 if (push_one)
2685 break;
2686 }
2687
2688 if (is_rwnd_limited)
2689 tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
2690 else
2691 tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
2692
2693 is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
2694 if (likely(sent_pkts || is_cwnd_limited))
2695 tcp_cwnd_validate(sk, is_cwnd_limited);
2696
2697 if (likely(sent_pkts)) {
2698 if (tcp_in_cwnd_reduction(sk))
2699 tp->prr_out += sent_pkts;
2700
2701
2702 if (push_one != 2)
2703 tcp_schedule_loss_probe(sk, false);
2704 return false;
2705 }
2706 return !tp->packets_out && !tcp_write_queue_empty(sk);
2707}
2708
2709bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
2710{
2711 struct inet_connection_sock *icsk = inet_csk(sk);
2712 struct tcp_sock *tp = tcp_sk(sk);
2713 u32 timeout, rto_delta_us;
2714 int early_retrans;
2715
2716
2717
2718
2719 if (rcu_access_pointer(tp->fastopen_rsk))
2720 return false;
2721
2722 early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
2723
2724
2725
2726 if ((early_retrans != 3 && early_retrans != 4) ||
2727 !tp->packets_out || !tcp_is_sack(tp) ||
2728 (icsk->icsk_ca_state != TCP_CA_Open &&
2729 icsk->icsk_ca_state != TCP_CA_CWR))
2730 return false;
2731
2732
2733
2734
2735
2736 if (tp->srtt_us) {
2737 timeout = usecs_to_jiffies(tp->srtt_us >> 2);
2738 if (tp->packets_out == 1)
2739 timeout += TCP_RTO_MIN;
2740 else
2741 timeout += TCP_TIMEOUT_MIN;
2742 } else {
2743 timeout = TCP_TIMEOUT_INIT;
2744 }
2745
2746
2747 rto_delta_us = advancing_rto ?
2748 jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
2749 tcp_rto_delta_us(sk);
2750 if (rto_delta_us > 0)
2751 timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
2752
2753 tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
2754 return true;
2755}
2756
2757
2758
2759
2760
2761static bool skb_still_in_host_queue(const struct sock *sk,
2762 const struct sk_buff *skb)
2763{
2764 if (unlikely(skb_fclone_busy(sk, skb))) {
2765 NET_INC_STATS(sock_net(sk),
2766 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2767 return true;
2768 }
2769 return false;
2770}
2771
2772
2773
2774
2775void tcp_send_loss_probe(struct sock *sk)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778 struct sk_buff *skb;
2779 int pcount;
2780 int mss = tcp_current_mss(sk);
2781
2782 skb = tcp_send_head(sk);
2783 if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
2784 pcount = tp->packets_out;
2785 tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2786 if (tp->packets_out > pcount)
2787 goto probe_sent;
2788 goto rearm_timer;
2789 }
2790 skb = skb_rb_last(&sk->tcp_rtx_queue);
2791 if (unlikely(!skb)) {
2792 WARN_ONCE(tp->packets_out,
2793 "invalid inflight: %u state %u cwnd %u mss %d\n",
2794 tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
2795 inet_csk(sk)->icsk_pending = 0;
2796 return;
2797 }
2798
2799
2800 if (tp->tlp_high_seq)
2801 goto rearm_timer;
2802
2803 if (skb_still_in_host_queue(sk, skb))
2804 goto rearm_timer;
2805
2806 pcount = tcp_skb_pcount(skb);
2807 if (WARN_ON(!pcount))
2808 goto rearm_timer;
2809
2810 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2811 if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2812 (pcount - 1) * mss, mss,
2813 GFP_ATOMIC)))
2814 goto rearm_timer;
2815 skb = skb_rb_next(skb);
2816 }
2817
2818 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
2819 goto rearm_timer;
2820
2821 if (__tcp_retransmit_skb(sk, skb, 1))
2822 goto rearm_timer;
2823
2824
2825 tp->tlp_high_seq = tp->snd_nxt;
2826
2827probe_sent:
2828 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
2829
2830 inet_csk(sk)->icsk_pending = 0;
2831rearm_timer:
2832 tcp_rearm_rto(sk);
2833}
2834
2835
2836
2837
2838
2839void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2840 int nonagle)
2841{
2842
2843
2844
2845
2846 if (unlikely(sk->sk_state == TCP_CLOSE))
2847 return;
2848
2849 if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2850 sk_gfp_mask(sk, GFP_ATOMIC)))
2851 tcp_check_probe_timer(sk);
2852}
2853
2854
2855
2856
2857void tcp_push_one(struct sock *sk, unsigned int mss_now)
2858{
2859 struct sk_buff *skb = tcp_send_head(sk);
2860
2861 BUG_ON(!skb || skb->len < mss_now);
2862
2863 tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
2864}
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918u32 __tcp_select_window(struct sock *sk)
2919{
2920 struct inet_connection_sock *icsk = inet_csk(sk);
2921 struct tcp_sock *tp = tcp_sk(sk);
2922
2923
2924
2925
2926
2927
2928 int mss = icsk->icsk_ack.rcv_mss;
2929 int free_space = tcp_space(sk);
2930 int allowed_space = tcp_full_space(sk);
2931 int full_space, window;
2932
2933 if (sk_is_mptcp(sk))
2934 mptcp_space(sk, &free_space, &allowed_space);
2935
2936 full_space = min_t(int, tp->window_clamp, allowed_space);
2937
2938 if (unlikely(mss > full_space)) {
2939 mss = full_space;
2940 if (mss <= 0)
2941 return 0;
2942 }
2943 if (free_space < (full_space >> 1)) {
2944 icsk->icsk_ack.quick = 0;
2945
2946 if (tcp_under_memory_pressure(sk))
2947 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2948 4U * tp->advmss);
2949
2950
2951
2952
2953 free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
2954
2955
2956
2957
2958
2959
2960
2961
2962 if (free_space < (allowed_space >> 4) || free_space < mss)
2963 return 0;
2964 }
2965
2966 if (free_space > tp->rcv_ssthresh)
2967 free_space = tp->rcv_ssthresh;
2968
2969
2970
2971
2972 if (tp->rx_opt.rcv_wscale) {
2973 window = free_space;
2974
2975
2976
2977
2978
2979 window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
2980 } else {
2981 window = tp->rcv_wnd;
2982
2983
2984
2985
2986
2987
2988
2989
2990 if (window <= free_space - mss || window > free_space)
2991 window = rounddown(free_space, mss);
2992 else if (mss == full_space &&
2993 free_space > window + (full_space >> 1))
2994 window = free_space;
2995 }
2996
2997 return window;
2998}
2999
3000void tcp_skb_collapse_tstamp(struct sk_buff *skb,
3001 const struct sk_buff *next_skb)
3002{
3003 if (unlikely(tcp_has_tx_tstamp(next_skb))) {
3004 const struct skb_shared_info *next_shinfo =
3005 skb_shinfo(next_skb);
3006 struct skb_shared_info *shinfo = skb_shinfo(skb);
3007
3008 shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
3009 shinfo->tskey = next_shinfo->tskey;
3010 TCP_SKB_CB(skb)->txstamp_ack |=
3011 TCP_SKB_CB(next_skb)->txstamp_ack;
3012 }
3013}
3014
3015
3016static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
3017{
3018 struct tcp_sock *tp = tcp_sk(sk);
3019 struct sk_buff *next_skb = skb_rb_next(skb);
3020 int skb_size, next_skb_size;
3021
3022 skb_size = skb->len;
3023 next_skb_size = next_skb->len;
3024
3025 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
3026
3027 if (next_skb_size) {
3028 if (next_skb_size <= skb_availroom(skb))
3029 skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
3030 next_skb_size);
3031 else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
3032 return false;
3033 }
3034 tcp_highest_sack_replace(sk, next_skb, skb);
3035
3036
3037 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
3038
3039
3040 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
3041
3042
3043
3044
3045 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
3046 TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
3047
3048
3049 tcp_clear_retrans_hints_partial(tp);
3050 if (next_skb == tp->retransmit_skb_hint)
3051 tp->retransmit_skb_hint = skb;
3052
3053 tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
3054
3055 tcp_skb_collapse_tstamp(skb, next_skb);
3056
3057 tcp_rtx_queue_unlink_and_free(next_skb, sk);
3058 return true;
3059}
3060
3061
3062static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
3063{
3064 if (tcp_skb_pcount(skb) > 1)
3065 return false;
3066 if (skb_cloned(skb))
3067 return false;
3068
3069 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3070 return false;
3071
3072 return true;
3073}
3074
3075
3076
3077
3078static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
3079 int space)
3080{
3081 struct tcp_sock *tp = tcp_sk(sk);
3082 struct sk_buff *skb = to, *tmp;
3083 bool first = true;
3084
3085 if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
3086 return;
3087 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3088 return;
3089
3090 skb_rbtree_walk_from_safe(skb, tmp) {
3091 if (!tcp_can_collapse(sk, skb))
3092 break;
3093
3094 if (!tcp_skb_can_collapse(to, skb))
3095 break;
3096
3097 space -= skb->len;
3098
3099 if (first) {
3100 first = false;
3101 continue;
3102 }
3103
3104 if (space < 0)
3105 break;
3106
3107 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
3108 break;
3109
3110 if (!tcp_collapse_retrans(sk, to))
3111 break;
3112 }
3113}
3114
3115
3116
3117
3118
3119int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
3120{
3121 struct inet_connection_sock *icsk = inet_csk(sk);
3122 struct tcp_sock *tp = tcp_sk(sk);
3123 unsigned int cur_mss;
3124 int diff, len, err;
3125
3126
3127
3128 if (icsk->icsk_mtup.probe_size)
3129 icsk->icsk_mtup.probe_size = 0;
3130
3131
3132
3133
3134 if (refcount_read(&sk->sk_wmem_alloc) >
3135 min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
3136 sk->sk_sndbuf))
3137 return -EAGAIN;
3138
3139 if (skb_still_in_host_queue(sk, skb))
3140 return -EBUSY;
3141
3142 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
3143 if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
3144 WARN_ON_ONCE(1);
3145 return -EINVAL;
3146 }
3147 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3148 return -ENOMEM;
3149 }
3150
3151 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3152 return -EHOSTUNREACH;
3153
3154 cur_mss = tcp_current_mss(sk);
3155
3156
3157
3158
3159
3160
3161 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
3162 TCP_SKB_CB(skb)->seq != tp->snd_una)
3163 return -EAGAIN;
3164
3165 len = cur_mss * segs;
3166 if (skb->len > len) {
3167 if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
3168 cur_mss, GFP_ATOMIC))
3169 return -ENOMEM;
3170 } else {
3171 if (skb_unclone(skb, GFP_ATOMIC))
3172 return -ENOMEM;
3173
3174 diff = tcp_skb_pcount(skb);
3175 tcp_set_skb_tso_segs(skb, cur_mss);
3176 diff -= tcp_skb_pcount(skb);
3177 if (diff)
3178 tcp_adjust_pcount(sk, skb, diff);
3179 if (skb->len < cur_mss)
3180 tcp_retrans_try_collapse(sk, skb, cur_mss);
3181 }
3182
3183
3184 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
3185 tcp_ecn_clear_syn(sk, skb);
3186
3187
3188 segs = tcp_skb_pcount(skb);
3189 TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
3190 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3191 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3192 tp->total_retrans += segs;
3193 tp->bytes_retrans += skb->len;
3194
3195
3196
3197
3198
3199 if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
3200 skb_headroom(skb) >= 0xFFFF)) {
3201 struct sk_buff *nskb;
3202
3203 tcp_skb_tsorted_save(skb) {
3204 nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
3205 if (nskb) {
3206 nskb->dev = NULL;
3207 err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
3208 } else {
3209 err = -ENOBUFS;
3210 }
3211 } tcp_skb_tsorted_restore(skb);
3212
3213 if (!err) {
3214 tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
3215 tcp_rate_skb_sent(sk, skb);
3216 }
3217 } else {
3218 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3219 }
3220
3221 if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
3222 tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
3223 TCP_SKB_CB(skb)->seq, segs, err);
3224
3225 if (likely(!err)) {
3226 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
3227 trace_tcp_retransmit_skb(sk, skb);
3228 } else if (err != -EBUSY) {
3229 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
3230 }
3231 return err;
3232}
3233
3234int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
3235{
3236 struct tcp_sock *tp = tcp_sk(sk);
3237 int err = __tcp_retransmit_skb(sk, skb, segs);
3238
3239 if (err == 0) {
3240#if FASTRETRANS_DEBUG > 0
3241 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
3242 net_dbg_ratelimited("retrans_out leaked\n");
3243 }
3244#endif
3245 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
3246 tp->retrans_out += tcp_skb_pcount(skb);
3247
3248
3249 if (!tp->retrans_stamp)
3250 tp->retrans_stamp = tcp_skb_timestamp(skb);
3251
3252 }
3253
3254 if (tp->undo_retrans < 0)
3255 tp->undo_retrans = 0;
3256 tp->undo_retrans += tcp_skb_pcount(skb);
3257 return err;
3258}
3259
3260
3261
3262
3263
3264
3265void tcp_xmit_retransmit_queue(struct sock *sk)
3266{
3267 const struct inet_connection_sock *icsk = inet_csk(sk);
3268 struct sk_buff *skb, *rtx_head, *hole = NULL;
3269 struct tcp_sock *tp = tcp_sk(sk);
3270 u32 max_segs;
3271 int mib_idx;
3272
3273 if (!tp->packets_out)
3274 return;
3275
3276 rtx_head = tcp_rtx_queue_head(sk);
3277 skb = tp->retransmit_skb_hint ?: rtx_head;
3278 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
3279 skb_rbtree_walk_from(skb) {
3280 __u8 sacked;
3281 int segs;
3282
3283 if (tcp_pacing_check(sk))
3284 break;
3285
3286
3287 if (!hole)
3288 tp->retransmit_skb_hint = skb;
3289
3290 segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
3291 if (segs <= 0)
3292 return;
3293 sacked = TCP_SKB_CB(skb)->sacked;
3294
3295
3296
3297 segs = min_t(int, segs, max_segs);
3298
3299 if (tp->retrans_out >= tp->lost_out) {
3300 break;
3301 } else if (!(sacked & TCPCB_LOST)) {
3302 if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
3303 hole = skb;
3304 continue;
3305
3306 } else {
3307 if (icsk->icsk_ca_state != TCP_CA_Loss)
3308 mib_idx = LINUX_MIB_TCPFASTRETRANS;
3309 else
3310 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
3311 }
3312
3313 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
3314 continue;
3315
3316 if (tcp_small_queue_check(sk, skb, 1))
3317 return;
3318
3319 if (tcp_retransmit_skb(sk, skb, segs))
3320 return;
3321
3322 NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
3323
3324 if (tcp_in_cwnd_reduction(sk))
3325 tp->prr_out += tcp_skb_pcount(skb);
3326
3327 if (skb == rtx_head &&
3328 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3329 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3330 inet_csk(sk)->icsk_rto,
3331 TCP_RTO_MAX);
3332 }
3333}
3334
3335
3336
3337
3338
3339
3340
3341
3342void sk_forced_mem_schedule(struct sock *sk, int size)
3343{
3344 int amt;
3345
3346 if (size <= sk->sk_forward_alloc)
3347 return;
3348 amt = sk_mem_pages(size);
3349 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
3350 sk_memory_allocated_add(sk, amt);
3351
3352 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3353 mem_cgroup_charge_skmem(sk->sk_memcg, amt);
3354}
3355
3356
3357
3358
3359void tcp_send_fin(struct sock *sk)
3360{
3361 struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
3362 struct tcp_sock *tp = tcp_sk(sk);
3363
3364
3365
3366
3367
3368
3369 tskb = tail;
3370 if (!tskb && tcp_under_memory_pressure(sk))
3371 tskb = skb_rb_last(&sk->tcp_rtx_queue);
3372
3373 if (tskb) {
3374coalesce:
3375 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
3376 TCP_SKB_CB(tskb)->end_seq++;
3377 tp->write_seq++;
3378 if (!tail) {
3379
3380
3381
3382
3383
3384
3385 tp->snd_nxt++;
3386 return;
3387 }
3388 } else {
3389 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
3390 if (unlikely(!skb)) {
3391 if (tskb)
3392 goto coalesce;
3393 return;
3394 }
3395 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
3396 skb_reserve(skb, MAX_TCP_HEADER);
3397 sk_forced_mem_schedule(sk, skb->truesize);
3398
3399 tcp_init_nondata_skb(skb, tp->write_seq,
3400 TCPHDR_ACK | TCPHDR_FIN);
3401 tcp_queue_skb(sk, skb);
3402 }
3403 __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
3404}
3405
3406
3407
3408
3409
3410
3411void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3412{
3413 struct sk_buff *skb;
3414
3415 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3416
3417
3418 skb = alloc_skb(MAX_TCP_HEADER, priority);
3419 if (!skb) {
3420 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3421 return;
3422 }
3423
3424
3425 skb_reserve(skb, MAX_TCP_HEADER);
3426 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
3427 TCPHDR_ACK | TCPHDR_RST);
3428 tcp_mstamp_refresh(tcp_sk(sk));
3429
3430 if (tcp_transmit_skb(sk, skb, 0, priority))
3431 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3432
3433
3434
3435
3436 trace_tcp_send_reset(sk, NULL);
3437}
3438
3439
3440
3441
3442
3443
3444
3445int tcp_send_synack(struct sock *sk)
3446{
3447 struct sk_buff *skb;
3448
3449 skb = tcp_rtx_queue_head(sk);
3450 if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3451 pr_err("%s: wrong queue state\n", __func__);
3452 return -EFAULT;
3453 }
3454 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
3455 if (skb_cloned(skb)) {
3456 struct sk_buff *nskb;
3457
3458 tcp_skb_tsorted_save(skb) {
3459 nskb = skb_copy(skb, GFP_ATOMIC);
3460 } tcp_skb_tsorted_restore(skb);
3461 if (!nskb)
3462 return -ENOMEM;
3463 INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
3464 tcp_highest_sack_replace(sk, skb, nskb);
3465 tcp_rtx_queue_unlink_and_free(skb, sk);
3466 __skb_header_release(nskb);
3467 tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3468 sk_wmem_queued_add(sk, nskb->truesize);
3469 sk_mem_charge(sk, nskb->truesize);
3470 skb = nskb;
3471 }
3472
3473 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
3474 tcp_ecn_send_synack(sk, skb);
3475 }
3476 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3477}
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3490 struct request_sock *req,
3491 struct tcp_fastopen_cookie *foc,
3492 enum tcp_synack_type synack_type,
3493 struct sk_buff *syn_skb)
3494{
3495 struct inet_request_sock *ireq = inet_rsk(req);
3496 const struct tcp_sock *tp = tcp_sk(sk);
3497 struct tcp_md5sig_key *md5 = NULL;
3498 struct tcp_out_options opts;
3499 struct sk_buff *skb;
3500 int tcp_header_size;
3501 struct tcphdr *th;
3502 int mss;
3503
3504 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3505 if (unlikely(!skb)) {
3506 dst_release(dst);
3507 return NULL;
3508 }
3509
3510 skb_reserve(skb, MAX_TCP_HEADER);
3511
3512 switch (synack_type) {
3513 case TCP_SYNACK_NORMAL:
3514 skb_set_owner_w(skb, req_to_sk(req));
3515 break;
3516 case TCP_SYNACK_COOKIE:
3517
3518
3519
3520 break;
3521 case TCP_SYNACK_FASTOPEN:
3522
3523
3524
3525
3526 skb_set_owner_w(skb, (struct sock *)sk);
3527 break;
3528 }
3529 skb_dst_set(skb, dst);
3530
3531 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3532
3533 memset(&opts, 0, sizeof(opts));
3534#ifdef CONFIG_SYN_COOKIES
3535 if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
3536 skb->skb_mstamp_ns = cookie_init_timestamp(req);
3537 else
3538#endif
3539 skb->skb_mstamp_ns = tcp_clock_ns();
3540
3541#ifdef CONFIG_TCP_MD5SIG
3542 rcu_read_lock();
3543 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
3544#endif
3545 skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
3546
3547 TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
3548 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
3549 foc, synack_type,
3550 syn_skb) + sizeof(*th);
3551
3552 skb_push(skb, tcp_header_size);
3553 skb_reset_transport_header(skb);
3554
3555 th = (struct tcphdr *)skb->data;
3556 memset(th, 0, sizeof(struct tcphdr));
3557 th->syn = 1;
3558 th->ack = 1;
3559 tcp_ecn_make_synack(req, th);
3560 th->source = htons(ireq->ir_num);
3561 th->dest = ireq->ir_rmt_port;
3562 skb->mark = ireq->ir_mark;
3563 skb->ip_summed = CHECKSUM_PARTIAL;
3564 th->seq = htonl(tcp_rsk(req)->snt_isn);
3565
3566 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
3567
3568
3569 th->window = htons(min(req->rsk_rcv_wnd, 65535U));
3570 tcp_options_write((__be32 *)(th + 1), NULL, &opts);
3571 th->doff = (tcp_header_size >> 2);
3572 __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
3573
3574#ifdef CONFIG_TCP_MD5SIG
3575
3576 if (md5)
3577 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
3578 md5, req_to_sk(req), skb);
3579 rcu_read_unlock();
3580#endif
3581
3582 bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
3583 synack_type, &opts);
3584
3585
3586 skb->tstamp = 0;
3587 return skb;
3588}
3589EXPORT_SYMBOL(tcp_make_synack);
3590
3591static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
3592{
3593 struct inet_connection_sock *icsk = inet_csk(sk);
3594 const struct tcp_congestion_ops *ca;
3595 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
3596
3597 if (ca_key == TCP_CA_UNSPEC)
3598 return;
3599
3600 rcu_read_lock();
3601 ca = tcp_ca_find_key(ca_key);
3602 if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
3603 bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
3604 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
3605 icsk->icsk_ca_ops = ca;
3606 }
3607 rcu_read_unlock();
3608}
3609
3610
3611static void tcp_connect_init(struct sock *sk)
3612{
3613 const struct dst_entry *dst = __sk_dst_get(sk);
3614 struct tcp_sock *tp = tcp_sk(sk);
3615 __u8 rcv_wscale;
3616 u32 rcv_wnd;
3617
3618
3619
3620
3621 tp->tcp_header_len = sizeof(struct tcphdr);
3622 if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
3623 tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
3624
3625#ifdef CONFIG_TCP_MD5SIG
3626 if (tp->af_specific->md5_lookup(sk, sk))
3627 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
3628#endif
3629
3630
3631 if (tp->rx_opt.user_mss)
3632 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
3633 tp->max_window = 0;
3634 tcp_mtup_init(sk);
3635 tcp_sync_mss(sk, dst_mtu(dst));
3636
3637 tcp_ca_dst_init(sk, dst);
3638
3639 if (!tp->window_clamp)
3640 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
3641 tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3642
3643 tcp_initialize_rcv_mss(sk);
3644
3645
3646 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
3647 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
3648 tp->window_clamp = tcp_full_space(sk);
3649
3650 rcv_wnd = tcp_rwnd_init_bpf(sk);
3651 if (rcv_wnd == 0)
3652 rcv_wnd = dst_metric(dst, RTAX_INITRWND);
3653
3654 tcp_select_initial_window(sk, tcp_full_space(sk),
3655 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
3656 &tp->rcv_wnd,
3657 &tp->window_clamp,
3658 sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
3659 &rcv_wscale,
3660 rcv_wnd);
3661
3662 tp->rx_opt.rcv_wscale = rcv_wscale;
3663 tp->rcv_ssthresh = tp->rcv_wnd;
3664
3665 sk->sk_err = 0;
3666 sock_reset_flag(sk, SOCK_DONE);
3667 tp->snd_wnd = 0;
3668 tcp_init_wl(tp, 0);
3669 tcp_write_queue_purge(sk);
3670 tp->snd_una = tp->write_seq;
3671 tp->snd_sml = tp->write_seq;
3672 tp->snd_up = tp->write_seq;
3673 tp->snd_nxt = tp->write_seq;
3674
3675 if (likely(!tp->repair))
3676 tp->rcv_nxt = 0;
3677 else
3678 tp->rcv_tstamp = tcp_jiffies32;
3679 tp->rcv_wup = tp->rcv_nxt;
3680 tp->copied_seq = tp->rcv_nxt;
3681
3682 inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
3683 inet_csk(sk)->icsk_retransmits = 0;
3684 tcp_clear_retrans(tp);
3685}
3686
3687static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
3688{
3689 struct tcp_sock *tp = tcp_sk(sk);
3690 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
3691
3692 tcb->end_seq += skb->len;
3693 __skb_header_release(skb);
3694 sk_wmem_queued_add(sk, skb->truesize);
3695 sk_mem_charge(sk, skb->truesize);
3696 WRITE_ONCE(tp->write_seq, tcb->end_seq);
3697 tp->packets_out += tcp_skb_pcount(skb);
3698}
3699
3700
3701
3702
3703
3704
3705
3706
3707static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3708{
3709 struct tcp_sock *tp = tcp_sk(sk);
3710 struct tcp_fastopen_request *fo = tp->fastopen_req;
3711 int space, err = 0;
3712 struct sk_buff *syn_data;
3713
3714 tp->rx_opt.mss_clamp = tp->advmss;
3715 if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
3716 goto fallback;
3717
3718
3719
3720
3721
3722 tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3723
3724 space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3725 MAX_TCP_OPTION_SPACE;
3726
3727 space = min_t(size_t, space, fo->size);
3728
3729
3730 space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
3731
3732 syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
3733 if (!syn_data)
3734 goto fallback;
3735 syn_data->ip_summed = CHECKSUM_PARTIAL;
3736 memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
3737 if (space) {
3738 int copied = copy_from_iter(skb_put(syn_data, space), space,
3739 &fo->data->msg_iter);
3740 if (unlikely(!copied)) {
3741 tcp_skb_tsorted_anchor_cleanup(syn_data);
3742 kfree_skb(syn_data);
3743 goto fallback;
3744 }
3745 if (copied != space) {
3746 skb_trim(syn_data, copied);
3747 space = copied;
3748 }
3749 }
3750
3751 if (space == fo->size)
3752 fo->data = NULL;
3753 fo->copied = space;
3754
3755 tcp_connect_queue_skb(sk, syn_data);
3756 if (syn_data->len)
3757 tcp_chrono_start(sk, TCP_CHRONO_BUSY);
3758
3759 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
3760
3761 syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
3762
3763
3764
3765
3766
3767
3768 TCP_SKB_CB(syn_data)->seq++;
3769 TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
3770 if (!err) {
3771 tp->syn_data = (fo->copied > 0);
3772 tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
3773 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
3774 goto done;
3775 }
3776
3777
3778 __skb_queue_tail(&sk->sk_write_queue, syn_data);
3779 tp->packets_out -= tcp_skb_pcount(syn_data);
3780
3781fallback:
3782
3783 if (fo->cookie.len > 0)
3784 fo->cookie.len = 0;
3785 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
3786 if (err)
3787 tp->syn_fastopen = 0;
3788done:
3789 fo->cookie.len = -1;
3790 return err;
3791}
3792
3793
3794int tcp_connect(struct sock *sk)
3795{
3796 struct tcp_sock *tp = tcp_sk(sk);
3797 struct sk_buff *buff;
3798 int err;
3799
3800 tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
3801
3802 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3803 return -EHOSTUNREACH;
3804
3805 tcp_connect_init(sk);
3806
3807 if (unlikely(tp->repair)) {
3808 tcp_finish_connect(sk, NULL);
3809 return 0;
3810 }
3811
3812 buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
3813 if (unlikely(!buff))
3814 return -ENOBUFS;
3815
3816 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
3817 tcp_mstamp_refresh(tp);
3818 tp->retrans_stamp = tcp_time_stamp(tp);
3819 tcp_connect_queue_skb(sk, buff);
3820 tcp_ecn_send_syn(sk, buff);
3821 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
3822
3823
3824 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
3825 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
3826 if (err == -ECONNREFUSED)
3827 return err;
3828
3829
3830
3831
3832 tp->snd_nxt = tp->write_seq;
3833 tp->pushed_seq = tp->write_seq;
3834 buff = tcp_send_head(sk);
3835 if (unlikely(buff)) {
3836 tp->snd_nxt = TCP_SKB_CB(buff)->seq;
3837 tp->pushed_seq = TCP_SKB_CB(buff)->seq;
3838 }
3839 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
3840
3841
3842 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3843 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
3844 return 0;
3845}
3846EXPORT_SYMBOL(tcp_connect);
3847
3848
3849
3850
3851
3852void tcp_send_delayed_ack(struct sock *sk)
3853{
3854 struct inet_connection_sock *icsk = inet_csk(sk);
3855 int ato = icsk->icsk_ack.ato;
3856 unsigned long timeout;
3857
3858 if (ato > TCP_DELACK_MIN) {
3859 const struct tcp_sock *tp = tcp_sk(sk);
3860 int max_ato = HZ / 2;
3861
3862 if (inet_csk_in_pingpong_mode(sk) ||
3863 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
3864 max_ato = TCP_DELACK_MAX;
3865
3866
3867
3868
3869
3870
3871
3872 if (tp->srtt_us) {
3873 int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
3874 TCP_DELACK_MIN);
3875
3876 if (rtt < max_ato)
3877 max_ato = rtt;
3878 }
3879
3880 ato = min(ato, max_ato);
3881 }
3882
3883 ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
3884
3885
3886 timeout = jiffies + ato;
3887
3888
3889 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
3890
3891
3892
3893 if (icsk->icsk_ack.blocked ||
3894 time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3895 tcp_send_ack(sk);
3896 return;
3897 }
3898
3899 if (!time_before(timeout, icsk->icsk_ack.timeout))
3900 timeout = icsk->icsk_ack.timeout;
3901 }
3902 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3903 icsk->icsk_ack.timeout = timeout;
3904 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
3905}
3906
3907
3908void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
3909{
3910 struct sk_buff *buff;
3911
3912
3913 if (sk->sk_state == TCP_CLOSE)
3914 return;
3915
3916
3917
3918
3919
3920 buff = alloc_skb(MAX_TCP_HEADER,
3921 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
3922 if (unlikely(!buff)) {
3923 inet_csk_schedule_ack(sk);
3924 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3925 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3926 TCP_DELACK_MAX, TCP_RTO_MAX);
3927 return;
3928 }
3929
3930
3931 skb_reserve(buff, MAX_TCP_HEADER);
3932 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
3933
3934
3935
3936
3937
3938 skb_set_tcp_pure_ack(buff);
3939
3940
3941 __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
3942}
3943EXPORT_SYMBOL_GPL(__tcp_send_ack);
3944
3945void tcp_send_ack(struct sock *sk)
3946{
3947 __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
3948}
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
3962{
3963 struct tcp_sock *tp = tcp_sk(sk);
3964 struct sk_buff *skb;
3965
3966
3967 skb = alloc_skb(MAX_TCP_HEADER,
3968 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
3969 if (!skb)
3970 return -1;
3971
3972
3973 skb_reserve(skb, MAX_TCP_HEADER);
3974
3975
3976
3977
3978 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
3979 NET_INC_STATS(sock_net(sk), mib);
3980 return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
3981}
3982
3983
3984void tcp_send_window_probe(struct sock *sk)
3985{
3986 if (sk->sk_state == TCP_ESTABLISHED) {
3987 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
3988 tcp_mstamp_refresh(tcp_sk(sk));
3989 tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
3990 }
3991}
3992
3993
3994int tcp_write_wakeup(struct sock *sk, int mib)
3995{
3996 struct tcp_sock *tp = tcp_sk(sk);
3997 struct sk_buff *skb;
3998
3999 if (sk->sk_state == TCP_CLOSE)
4000 return -1;
4001
4002 skb = tcp_send_head(sk);
4003 if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
4004 int err;
4005 unsigned int mss = tcp_current_mss(sk);
4006 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
4007
4008 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
4009 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
4010
4011
4012
4013
4014
4015 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
4016 skb->len > mss) {
4017 seg_size = min(seg_size, mss);
4018 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
4019 if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
4020 skb, seg_size, mss, GFP_ATOMIC))
4021 return -1;
4022 } else if (!tcp_skb_pcount(skb))
4023 tcp_set_skb_tso_segs(skb, mss);
4024
4025 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
4026 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
4027 if (!err)
4028 tcp_event_new_data_sent(sk, skb);
4029 return err;
4030 } else {
4031 if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
4032 tcp_xmit_probe_skb(sk, 1, mib);
4033 return tcp_xmit_probe_skb(sk, 0, mib);
4034 }
4035}
4036
4037
4038
4039
4040void tcp_send_probe0(struct sock *sk)
4041{
4042 struct inet_connection_sock *icsk = inet_csk(sk);
4043 struct tcp_sock *tp = tcp_sk(sk);
4044 struct net *net = sock_net(sk);
4045 unsigned long probe_max;
4046 int err;
4047
4048 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
4049
4050 if (tp->packets_out || tcp_write_queue_empty(sk)) {
4051
4052 icsk->icsk_probes_out = 0;
4053 icsk->icsk_backoff = 0;
4054 return;
4055 }
4056
4057 if (err <= 0) {
4058 if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
4059 icsk->icsk_backoff++;
4060 icsk->icsk_probes_out++;
4061 probe_max = TCP_RTO_MAX;
4062 } else {
4063
4064
4065
4066
4067
4068
4069 if (!icsk->icsk_probes_out)
4070 icsk->icsk_probes_out = 1;
4071 probe_max = TCP_RESOURCE_PROBE_INTERVAL;
4072 }
4073 tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
4074 tcp_probe0_when(sk, probe_max),
4075 TCP_RTO_MAX);
4076}
4077
4078int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
4079{
4080 const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
4081 struct flowi fl;
4082 int res;
4083
4084 tcp_rsk(req)->txhash = net_tx_rndhash();
4085 res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
4086 NULL);
4087 if (!res) {
4088 __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
4089 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4090 if (unlikely(tcp_passive_fastopen(sk)))
4091 tcp_sk(sk)->total_retrans++;
4092 trace_tcp_retransmit_synack(sk, req);
4093 }
4094 return res;
4095}
4096EXPORT_SYMBOL(tcp_rtx_synack);
4097