/*
 * TCP output engine: builds TCP headers and options, selects receive
 * windows to advertise, paces and transmits segments, and maintains the
 * socket's write and retransmit queues.
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <net/tcp.h>
#include <net/mptcp.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>

#include <trace/events/tcp.h>

/* Refresh the per-socket clock caches used on the transmit path. */
void tcp_mstamp_refresh(struct tcp_sock *tp)
{
	u64 val = tcp_clock_ns();

	tp->tcp_clock_cache = val;
	tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
}

static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp);

/* Account for new data that has been sent to the network: advance snd_nxt
 * and move the skb from the write queue to the retransmit rb-tree.
 */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);

	__skb_unlink(skb, &sk->sk_write_queue);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);

	if (tp->highest_sack == NULL)
		tp->highest_sack = skb;

	tp->packets_out += tcp_skb_pcount(skb);
	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
		tcp_rearm_rto(sk);

	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
		      tcp_skb_pcount(skb));
}
86
87
88
89
90
91
92
93
94static inline __u32 tcp_acceptable_seq(const struct sock *sk)
95{
96 const struct tcp_sock *tp = tcp_sk(sk);
97
98 if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
99 (tp->rx_opt.wscale_ok &&
100 ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
101 return tp->snd_nxt;
102 else
103 return tcp_wnd_end(tp);
104}
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120static __u16 tcp_advertise_mss(struct sock *sk)
121{
122 struct tcp_sock *tp = tcp_sk(sk);
123 const struct dst_entry *dst = __sk_dst_get(sk);
124 int mss = tp->advmss;
125
126 if (dst) {
127 unsigned int metric = dst_metric_advmss(dst);
128
129 if (metric < mss) {
130 mss = metric;
131 tp->advmss = mss;
132 }
133 }
134
135 return (__u16)mss;
136}

/* RFC 2861: congestion window restart after an idle period.  Halve cwnd
 * once for every RTO that elapsed while idle, but never drop below the
 * smaller of the initial window and the pre-idle cwnd.
 */
void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
	u32 cwnd = tp->snd_cwnd;

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_jiffies32;
	tp->snd_cwnd_used = 0;
}
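/*
 * Worked example (illustrative numbers, not taken from this file): with
 * icsk_rto = 200 ms, snd_cwnd = 40 and an initial window of 10, an idle
 * gap of delta = 650 ms halves cwnd twice (40 -> 20 -> 10) and then stops
 * because cwnd has reached restart_cwnd, so the connection restarts with
 * snd_cwnd = max(10, 10) = 10.
 */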

/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_jiffies32;

	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	/* If this is the first data sent since the last segment we received,
	 * and it goes out within the delayed-ACK window (ato), count it
	 * towards pingpong (interactive) mode.
	 */
	if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
	    (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
		inet_csk_inc_pingpong_cnt(sk);

	tp->lsndtime = now;
}

/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
				      u32 rcv_nxt)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unlikely(tp->compressed_ack)) {
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
			      tp->compressed_ack);
		tp->compressed_ack = 0;
		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
			__sock_put(sk);
	}

	if (unlikely(rcv_nxt != tp->rcv_nxt))
		return;	/* ACK carrying a different rcv_nxt (e.g. ECN echo) */
	tcp_dec_quickack_mode(sk, pkts);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}

/* Determine a window scaling factor and an initial window to offer.
 * The caller passes the amount of receive buffer space that will be
 * available; the results are written through the output pointers.
 */
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp is set, use the largest possible scaled window. */
	if (*window_clamp == 0)
		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = rounddown(space, mss);

	/* Some buggy stacks treat the window field as signed; optionally
	 * cap the unscaled initial window at 32767 (MAX_TCP_WINDOW) to
	 * stay interoperable with them.
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = min_t(u32, space, U16_MAX);

	if (init_rcv_wnd)
		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);

	*rcv_wscale = 0;
	if (wscale_ok) {
		/* Pick a scale factor for the maximum window we may ever need. */
		space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
		space = max_t(u32, space, sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
				      0, TCP_MAX_WSCALE);
	}
	/* Set the clamp no higher than the largest representable value. */
	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
}
EXPORT_SYMBOL(tcp_select_initial_window);
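/*
 * Worked example (hypothetical values): __space = 1 MB and mss = 1460.
 * space is rounded down to 1048280 (a multiple of mss) and the initial
 * rcv_wnd is capped at 65535 (or 32767 with the signed-window workaround).
 * For the scale factor, space is raised to at least tcp_rmem[2]; with a
 * 6 MB maximum, ilog2(6291456) = 22, so rcv_wscale = clamp(22 - 15, 0, 14)
 * = 7, enough to advertise windows of up to 65535 << 7 (~8 MB) later on.
 */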

/* Choose a new window to advertise and update rcv_wnd/rcv_wup.  The value
 * returned is already scaled down by rcv_wscale (RFC 7323), ready to be
 * placed in the 16-bit header field.  The offered window is never shrunk
 * relative to what was previously advertised.
 */
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 old_win = tp->rcv_wnd;
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window. */
	if (new_win < cur_win) {
		/* __tcp_select_window() wants a smaller window, but shrinking
		 * would violate the peer's expectations; keep at least the
		 * current window, rounded up to the scale granularity.
		 */
		if (new_win == 0)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPWANTZEROWINDOWADV);
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible scaled window. */
	if (!tp->rx_opt.rcv_wscale &&
	    sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* RFC 7323 scaling applied. */
	new_win >>= tp->rx_opt.rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0) {
		tp->pred_flags = 0;
		if (old_win)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPTOZEROWINDOWADV);
	} else if (old_win == 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
	}

	return new_win;
}
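/*
 * Scaling example (illustrative): with rcv_wscale = 7, an internal window
 * of 262144 bytes goes on the wire as 262144 >> 7 = 2048; the peer
 * multiplies it back by 2^7.  When __tcp_select_window() would shrink the
 * window, the code above instead re-advertises the current window rounded
 * up to the 2^rcv_wscale granularity, so the peer never sees the offered
 * window regress.
 */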
307
308
309static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
310{
311 const struct tcp_sock *tp = tcp_sk(sk);
312
313 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
314 if (!(tp->ecn_flags & TCP_ECN_OK))
315 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
316 else if (tcp_ca_needs_ecn(sk) ||
317 tcp_bpf_ca_needs_ecn(sk))
318 INET_ECN_xmit(sk);
319}
320
321
322static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
323{
324 struct tcp_sock *tp = tcp_sk(sk);
325 bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
326 bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
327 tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
328
329 if (!use_ecn) {
330 const struct dst_entry *dst = __sk_dst_get(sk);
331
332 if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
333 use_ecn = true;
334 }
335
336 tp->ecn_flags = 0;
337
338 if (use_ecn) {
339 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
340 tp->ecn_flags = TCP_ECN_OK;
341 if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
342 INET_ECN_xmit(sk);
343 }
344}
345
346static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
347{
348 if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
349
350
351
352 TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
353}
354
355static void
356tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
357{
358 if (inet_rsk(req)->ecn_ok)
359 th->ece = 1;
360}
361
362
363
364
365static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
366 struct tcphdr *th, int tcp_header_len)
367{
368 struct tcp_sock *tp = tcp_sk(sk);
369
370 if (tp->ecn_flags & TCP_ECN_OK) {
371
372 if (skb->len != tcp_header_len &&
373 !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
374 INET_ECN_xmit(sk);
375 if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
376 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
377 th->cwr = 1;
378 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
379 }
380 } else if (!tcp_ca_needs_ecn(sk)) {
381
382 INET_ECN_dontxmit(sk);
383 }
384 if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
385 th->ece = 1;
386 }
387}
388
389
390
391
392static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
393{
394 skb->ip_summed = CHECKSUM_PARTIAL;
395
396 TCP_SKB_CB(skb)->tcp_flags = flags;
397 TCP_SKB_CB(skb)->sacked = 0;
398
399 tcp_skb_pcount_set(skb, 1);
400
401 TCP_SKB_CB(skb)->seq = seq;
402 if (flags & (TCPHDR_SYN | TCPHDR_FIN))
403 seq++;
404 TCP_SKB_CB(skb)->end_seq = seq;
405}
406
407static inline bool tcp_urg_mode(const struct tcp_sock *tp)
408{
409 return tp->snd_una != tp->snd_up;
410}
411
412#define OPTION_SACK_ADVERTISE (1 << 0)
413#define OPTION_TS (1 << 1)
414#define OPTION_MD5 (1 << 2)
415#define OPTION_WSCALE (1 << 3)
416#define OPTION_FAST_OPEN_COOKIE (1 << 8)
417#define OPTION_SMC (1 << 9)
418#define OPTION_MPTCP (1 << 10)
419
420static void smc_options_write(__be32 *ptr, u16 *options)
421{
422#if IS_ENABLED(CONFIG_SMC)
423 if (static_branch_unlikely(&tcp_have_smc)) {
424 if (unlikely(OPTION_SMC & *options)) {
425 *ptr++ = htonl((TCPOPT_NOP << 24) |
426 (TCPOPT_NOP << 16) |
427 (TCPOPT_EXP << 8) |
428 (TCPOLEN_EXP_SMC_BASE));
429 *ptr++ = htonl(TCPOPT_SMC_MAGIC);
430 }
431 }
432#endif
433}
434
435struct tcp_out_options {
436 u16 options;
437 u16 mss;
438 u8 ws;
439 u8 num_sack_blocks;
440 u8 hash_size;
441 __u8 *hash_location;
442 __u32 tsval, tsecr;
443 struct tcp_fastopen_cookie *fastopen_cookie;
444 struct mptcp_out_options mptcp;
445};
446
447static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
448{
449#if IS_ENABLED(CONFIG_MPTCP)
450 if (unlikely(OPTION_MPTCP & opts->options))
451 mptcp_write_options(ptr, &opts->mptcp);
452#endif
453}
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
469 struct tcp_out_options *opts)
470{
471 u16 options = opts->options;
472
473 if (unlikely(OPTION_MD5 & options)) {
474 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
475 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
476
477 opts->hash_location = (__u8 *)ptr;
478 ptr += 4;
479 }
480
481 if (unlikely(opts->mss)) {
482 *ptr++ = htonl((TCPOPT_MSS << 24) |
483 (TCPOLEN_MSS << 16) |
484 opts->mss);
485 }
486
487 if (likely(OPTION_TS & options)) {
488 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
489 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
490 (TCPOLEN_SACK_PERM << 16) |
491 (TCPOPT_TIMESTAMP << 8) |
492 TCPOLEN_TIMESTAMP);
493 options &= ~OPTION_SACK_ADVERTISE;
494 } else {
495 *ptr++ = htonl((TCPOPT_NOP << 24) |
496 (TCPOPT_NOP << 16) |
497 (TCPOPT_TIMESTAMP << 8) |
498 TCPOLEN_TIMESTAMP);
499 }
500 *ptr++ = htonl(opts->tsval);
501 *ptr++ = htonl(opts->tsecr);
502 }
503
504 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
505 *ptr++ = htonl((TCPOPT_NOP << 24) |
506 (TCPOPT_NOP << 16) |
507 (TCPOPT_SACK_PERM << 8) |
508 TCPOLEN_SACK_PERM);
509 }
510
511 if (unlikely(OPTION_WSCALE & options)) {
512 *ptr++ = htonl((TCPOPT_NOP << 24) |
513 (TCPOPT_WINDOW << 16) |
514 (TCPOLEN_WINDOW << 8) |
515 opts->ws);
516 }
517
518 if (unlikely(opts->num_sack_blocks)) {
519 struct tcp_sack_block *sp = tp->rx_opt.dsack ?
520 tp->duplicate_sack : tp->selective_acks;
521 int this_sack;
522
523 *ptr++ = htonl((TCPOPT_NOP << 24) |
524 (TCPOPT_NOP << 16) |
525 (TCPOPT_SACK << 8) |
526 (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
527 TCPOLEN_SACK_PERBLOCK)));
528
529 for (this_sack = 0; this_sack < opts->num_sack_blocks;
530 ++this_sack) {
531 *ptr++ = htonl(sp[this_sack].start_seq);
532 *ptr++ = htonl(sp[this_sack].end_seq);
533 }
534
535 tp->rx_opt.dsack = 0;
536 }
537
538 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
539 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
540 u8 *p = (u8 *)ptr;
541 u32 len;
542
543 if (foc->exp) {
544 len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
545 *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
546 TCPOPT_FASTOPEN_MAGIC);
547 p += TCPOLEN_EXP_FASTOPEN_BASE;
548 } else {
549 len = TCPOLEN_FASTOPEN_BASE + foc->len;
550 *p++ = TCPOPT_FASTOPEN;
551 *p++ = len;
552 }
553
554 memcpy(p, foc->val, foc->len);
555 if ((len & 3) == 2) {
556 p[foc->len] = TCPOPT_NOP;
557 p[foc->len + 1] = TCPOPT_NOP;
558 }
559 ptr += (len + 3) >> 2;
560 }
561
562 smc_options_write(ptr, &options);
563
564 mptcp_options_write(ptr, opts);
565}
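/*
 * Option layout example (values follow from the TCP option definitions):
 * when both timestamps and SACK are negotiated on a SYN, the first 32-bit
 * word written above is
 *   (TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) |
 *   (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP = 0x0402080a,
 * i.e. kind 4/len 2 (SACK permitted) immediately followed by kind 8/len 10
 * (timestamps), which keeps the option block 32-bit aligned without NOPs.
 */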
566
567static void smc_set_option(const struct tcp_sock *tp,
568 struct tcp_out_options *opts,
569 unsigned int *remaining)
570{
571#if IS_ENABLED(CONFIG_SMC)
572 if (static_branch_unlikely(&tcp_have_smc)) {
573 if (tp->syn_smc) {
574 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
575 opts->options |= OPTION_SMC;
576 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
577 }
578 }
579 }
580#endif
581}
582
583static void smc_set_option_cond(const struct tcp_sock *tp,
584 const struct inet_request_sock *ireq,
585 struct tcp_out_options *opts,
586 unsigned int *remaining)
587{
588#if IS_ENABLED(CONFIG_SMC)
589 if (static_branch_unlikely(&tcp_have_smc)) {
590 if (tp->syn_smc && ireq->smc_ok) {
591 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
592 opts->options |= OPTION_SMC;
593 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
594 }
595 }
596 }
597#endif
598}
599
600static void mptcp_set_option_cond(const struct request_sock *req,
601 struct tcp_out_options *opts,
602 unsigned int *remaining)
603{
604 if (rsk_is_mptcp(req)) {
605 unsigned int size;
606
607 if (mptcp_synack_options(req, &size, &opts->mptcp)) {
608 if (*remaining >= size) {
609 opts->options |= OPTION_MPTCP;
610 *remaining -= size;
611 }
612 }
613 }
614}
615
616
617
618
619static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
620 struct tcp_out_options *opts,
621 struct tcp_md5sig_key **md5)
622{
623 struct tcp_sock *tp = tcp_sk(sk);
624 unsigned int remaining = MAX_TCP_OPTION_SPACE;
625 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
626
627 *md5 = NULL;
628#ifdef CONFIG_TCP_MD5SIG
629 if (static_branch_unlikely(&tcp_md5_needed) &&
630 rcu_access_pointer(tp->md5sig_info)) {
631 *md5 = tp->af_specific->md5_lookup(sk, sk);
632 if (*md5) {
633 opts->options |= OPTION_MD5;
634 remaining -= TCPOLEN_MD5SIG_ALIGNED;
635 }
636 }
637#endif
638
639
640
641
642
643
644
645
646
647
648 opts->mss = tcp_advertise_mss(sk);
649 remaining -= TCPOLEN_MSS_ALIGNED;
650
651 if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
652 opts->options |= OPTION_TS;
653 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
654 opts->tsecr = tp->rx_opt.ts_recent;
655 remaining -= TCPOLEN_TSTAMP_ALIGNED;
656 }
657 if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
658 opts->ws = tp->rx_opt.rcv_wscale;
659 opts->options |= OPTION_WSCALE;
660 remaining -= TCPOLEN_WSCALE_ALIGNED;
661 }
662 if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
663 opts->options |= OPTION_SACK_ADVERTISE;
664 if (unlikely(!(OPTION_TS & opts->options)))
665 remaining -= TCPOLEN_SACKPERM_ALIGNED;
666 }
667
668 if (fastopen && fastopen->cookie.len >= 0) {
669 u32 need = fastopen->cookie.len;
670
671 need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
672 TCPOLEN_FASTOPEN_BASE;
673 need = (need + 3) & ~3U;
674 if (remaining >= need) {
675 opts->options |= OPTION_FAST_OPEN_COOKIE;
676 opts->fastopen_cookie = &fastopen->cookie;
677 remaining -= need;
678 tp->syn_fastopen = 1;
679 tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
680 }
681 }
682
683 smc_set_option(tp, opts, &remaining);
684
685 if (sk_is_mptcp(sk)) {
686 unsigned int size;
687
688 if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
689 opts->options |= OPTION_MPTCP;
690 remaining -= size;
691 }
692 }
693
694 return MAX_TCP_OPTION_SPACE - remaining;
695}
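/*
 * Budget sketch (typical case, assuming no MD5/MPTCP): of the 40 bytes of
 * option space (MAX_TCP_OPTION_SPACE), a SYN normally spends 4 on MSS,
 * 12 on timestamps (which also carry SACK-permitted in the same aligned
 * block, see tcp_options_write()), and 4 on window scaling, leaving
 * 40 - 4 - 12 - 4 = 20 bytes for a Fast Open cookie or experimental
 * options.  MD5 alone would consume 20 bytes (TCPOLEN_MD5SIG_ALIGNED) and
 * disables timestamps on this path.
 */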
696
697
698static unsigned int tcp_synack_options(const struct sock *sk,
699 struct request_sock *req,
700 unsigned int mss, struct sk_buff *skb,
701 struct tcp_out_options *opts,
702 const struct tcp_md5sig_key *md5,
703 struct tcp_fastopen_cookie *foc,
704 enum tcp_synack_type synack_type)
705{
706 struct inet_request_sock *ireq = inet_rsk(req);
707 unsigned int remaining = MAX_TCP_OPTION_SPACE;
708
709#ifdef CONFIG_TCP_MD5SIG
710 if (md5) {
711 opts->options |= OPTION_MD5;
712 remaining -= TCPOLEN_MD5SIG_ALIGNED;
713
714
715
716
717
718
719 if (synack_type != TCP_SYNACK_COOKIE)
720 ireq->tstamp_ok &= !ireq->sack_ok;
721 }
722#endif
723
724
725 opts->mss = mss;
726 remaining -= TCPOLEN_MSS_ALIGNED;
727
728 if (likely(ireq->wscale_ok)) {
729 opts->ws = ireq->rcv_wscale;
730 opts->options |= OPTION_WSCALE;
731 remaining -= TCPOLEN_WSCALE_ALIGNED;
732 }
733 if (likely(ireq->tstamp_ok)) {
734 opts->options |= OPTION_TS;
735 opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
736 opts->tsecr = req->ts_recent;
737 remaining -= TCPOLEN_TSTAMP_ALIGNED;
738 }
739 if (likely(ireq->sack_ok)) {
740 opts->options |= OPTION_SACK_ADVERTISE;
741 if (unlikely(!ireq->tstamp_ok))
742 remaining -= TCPOLEN_SACKPERM_ALIGNED;
743 }
744 if (foc != NULL && foc->len >= 0) {
745 u32 need = foc->len;
746
747 need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
748 TCPOLEN_FASTOPEN_BASE;
749 need = (need + 3) & ~3U;
750 if (remaining >= need) {
751 opts->options |= OPTION_FAST_OPEN_COOKIE;
752 opts->fastopen_cookie = foc;
753 remaining -= need;
754 }
755 }
756
757 mptcp_set_option_cond(req, opts, &remaining);
758
759 smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
760
761 return MAX_TCP_OPTION_SPACE - remaining;
762}
763
764
765
766
767static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
768 struct tcp_out_options *opts,
769 struct tcp_md5sig_key **md5)
770{
771 struct tcp_sock *tp = tcp_sk(sk);
772 unsigned int size = 0;
773 unsigned int eff_sacks;
774
775 opts->options = 0;
776
777 *md5 = NULL;
778#ifdef CONFIG_TCP_MD5SIG
779 if (static_branch_unlikely(&tcp_md5_needed) &&
780 rcu_access_pointer(tp->md5sig_info)) {
781 *md5 = tp->af_specific->md5_lookup(sk, sk);
782 if (*md5) {
783 opts->options |= OPTION_MD5;
784 size += TCPOLEN_MD5SIG_ALIGNED;
785 }
786 }
787#endif
788
789 if (likely(tp->rx_opt.tstamp_ok)) {
790 opts->options |= OPTION_TS;
791 opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
792 opts->tsecr = tp->rx_opt.ts_recent;
793 size += TCPOLEN_TSTAMP_ALIGNED;
794 }
795
796
797
798
799
800
801
802 if (sk_is_mptcp(sk)) {
803 unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
804 unsigned int opt_size = 0;
805
806 if (mptcp_established_options(sk, skb, &opt_size, remaining,
807 &opts->mptcp)) {
808 opts->options |= OPTION_MPTCP;
809 size += opt_size;
810 }
811 }
812
813 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
814 if (unlikely(eff_sacks)) {
815 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
816 if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
817 TCPOLEN_SACK_PERBLOCK))
818 return size;
819
820 opts->num_sack_blocks =
821 min_t(unsigned int, eff_sacks,
822 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
823 TCPOLEN_SACK_PERBLOCK);
824
825 size += TCPOLEN_SACK_BASE_ALIGNED +
826 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
827 }
828
829 return size;
830}
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847struct tsq_tasklet {
848 struct tasklet_struct tasklet;
849 struct list_head head;
850};
851static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
852
853static void tcp_tsq_write(struct sock *sk)
854{
855 if ((1 << sk->sk_state) &
856 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
857 TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
858 struct tcp_sock *tp = tcp_sk(sk);
859
860 if (tp->lost_out > tp->retrans_out &&
861 tp->snd_cwnd > tcp_packets_in_flight(tp)) {
862 tcp_mstamp_refresh(tp);
863 tcp_xmit_retransmit_queue(sk);
864 }
865
866 tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
867 0, GFP_ATOMIC);
868 }
869}
870
871static void tcp_tsq_handler(struct sock *sk)
872{
873 bh_lock_sock(sk);
874 if (!sock_owned_by_user(sk))
875 tcp_tsq_write(sk);
876 else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
877 sock_hold(sk);
878 bh_unlock_sock(sk);
879}
880
881
882
883
884
885
886static void tcp_tasklet_func(unsigned long data)
887{
888 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
889 LIST_HEAD(list);
890 unsigned long flags;
891 struct list_head *q, *n;
892 struct tcp_sock *tp;
893 struct sock *sk;
894
895 local_irq_save(flags);
896 list_splice_init(&tsq->head, &list);
897 local_irq_restore(flags);
898
899 list_for_each_safe(q, n, &list) {
900 tp = list_entry(q, struct tcp_sock, tsq_node);
901 list_del(&tp->tsq_node);
902
903 sk = (struct sock *)tp;
904 smp_mb__before_atomic();
905 clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
906
907 tcp_tsq_handler(sk);
908 sk_free(sk);
909 }
910}
911
912#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
913 TCPF_WRITE_TIMER_DEFERRED | \
914 TCPF_DELACK_TIMER_DEFERRED | \
915 TCPF_MTU_REDUCED_DEFERRED)
916
917
918
919
920
921
922
923void tcp_release_cb(struct sock *sk)
924{
925 unsigned long flags, nflags;
926
927
928 do {
929 flags = sk->sk_tsq_flags;
930 if (!(flags & TCP_DEFERRED_ALL))
931 return;
932 nflags = flags & ~TCP_DEFERRED_ALL;
933 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
934
935 if (flags & TCPF_TSQ_DEFERRED) {
936 tcp_tsq_write(sk);
937 __sock_put(sk);
938 }
939
940
941
942
943
944
945
946
947
948 sock_release_ownership(sk);
949
950 if (flags & TCPF_WRITE_TIMER_DEFERRED) {
951 tcp_write_timer_handler(sk);
952 __sock_put(sk);
953 }
954 if (flags & TCPF_DELACK_TIMER_DEFERRED) {
955 tcp_delack_timer_handler(sk);
956 __sock_put(sk);
957 }
958 if (flags & TCPF_MTU_REDUCED_DEFERRED) {
959 inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
960 __sock_put(sk);
961 }
962}
963EXPORT_SYMBOL(tcp_release_cb);
964
965void __init tcp_tasklet_init(void)
966{
967 int i;
968
969 for_each_possible_cpu(i) {
970 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
971
972 INIT_LIST_HEAD(&tsq->head);
973 tasklet_init(&tsq->tasklet,
974 tcp_tasklet_func,
975 (unsigned long)tsq);
976 }
977}
978
979
980
981
982
983
984void tcp_wfree(struct sk_buff *skb)
985{
986 struct sock *sk = skb->sk;
987 struct tcp_sock *tp = tcp_sk(sk);
988 unsigned long flags, nval, oval;
989
990
991
992
993 WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
994
995
996
997
998
999
1000
1001
1002 if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
1003 goto out;
1004
1005 for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
1006 struct tsq_tasklet *tsq;
1007 bool empty;
1008
1009 if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
1010 goto out;
1011
1012 nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
1013 nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
1014 if (nval != oval)
1015 continue;
1016
1017
1018 local_irq_save(flags);
1019 tsq = this_cpu_ptr(&tsq_tasklet);
1020 empty = list_empty(&tsq->head);
1021 list_add(&tp->tsq_node, &tsq->head);
1022 if (empty)
1023 tasklet_schedule(&tsq->tasklet);
1024 local_irq_restore(flags);
1025 return;
1026 }
1027out:
1028 sk_free(sk);
1029}
1030
1031
1032
1033
1034enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
1035{
1036 struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
1037 struct sock *sk = (struct sock *)tp;
1038
1039 tcp_tsq_handler(sk);
1040 sock_put(sk);
1041
1042 return HRTIMER_NORESTART;
1043}
1044
static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
				      u64 prior_wstamp)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sk->sk_pacing_status != SK_PACING_NONE) {
		unsigned long rate = sk->sk_pacing_rate;

		/* Skip pacing for the first 10 data segments (mirrors fq,
		 * which does not pace the initial burst).
		 */
		if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
			u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
			u64 credit = tp->tcp_wstamp_ns - prior_wstamp;

			/* Give back at most half of the slot as credit for
			 * time that already elapsed since the last transmit.
			 */
			len_ns -= min_t(u64, len_ns / 2, credit);
			tp->tcp_wstamp_ns += len_ns;
		}
	}
	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}
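/*
 * Pacing arithmetic example (illustrative): at sk_pacing_rate =
 * 12,500,000 B/s (100 Mbit/s) a 1500-byte skb accounts for
 * len_ns = 1500 * 1e9 / 12.5e6 = 120,000 ns.  If 50,000 ns have already
 * elapsed since the previous transmit (credit), up to half of len_ns may
 * be forgiven, so tcp_wstamp_ns advances by 120,000 - 50,000 = 70,000 ns.
 */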
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1081 int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
1082{
1083 const struct inet_connection_sock *icsk = inet_csk(sk);
1084 struct inet_sock *inet;
1085 struct tcp_sock *tp;
1086 struct tcp_skb_cb *tcb;
1087 struct tcp_out_options opts;
1088 unsigned int tcp_options_size, tcp_header_size;
1089 struct sk_buff *oskb = NULL;
1090 struct tcp_md5sig_key *md5;
1091 struct tcphdr *th;
1092 u64 prior_wstamp;
1093 int err;
1094
1095 BUG_ON(!skb || !tcp_skb_pcount(skb));
1096 tp = tcp_sk(sk);
1097 prior_wstamp = tp->tcp_wstamp_ns;
1098 tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1099 skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
1100 if (clone_it) {
1101 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
1102 - tp->snd_una;
1103 oskb = skb;
1104
1105 tcp_skb_tsorted_save(oskb) {
1106 if (unlikely(skb_cloned(oskb)))
1107 skb = pskb_copy(oskb, gfp_mask);
1108 else
1109 skb = skb_clone(oskb, gfp_mask);
1110 } tcp_skb_tsorted_restore(oskb);
1111
1112 if (unlikely(!skb))
1113 return -ENOBUFS;
1114
1115
1116
1117 skb->dev = NULL;
1118 }
1119
1120 inet = inet_sk(sk);
1121 tcb = TCP_SKB_CB(skb);
1122 memset(&opts, 0, sizeof(opts));
1123
1124 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
1125 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1126 } else {
1127 tcp_options_size = tcp_established_options(sk, skb, &opts,
1128 &md5);
1129
1130
1131
1132
1133
1134
1135
1136
1137 if (tcp_skb_pcount(skb) > 1)
1138 tcb->tcp_flags |= TCPHDR_PSH;
1139 }
1140 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
1141
1142
1143
1144
1145
1146
1147
1148
1149 skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
1150
1151
1152
1153
1154
1155
1156 skb->pfmemalloc = 0;
1157
1158 skb_push(skb, tcp_header_size);
1159 skb_reset_transport_header(skb);
1160
1161 skb_orphan(skb);
1162 skb->sk = sk;
1163 skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
1164 skb_set_hash_from_sk(skb, sk);
1165 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1166
1167 skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
1168
1169
1170 th = (struct tcphdr *)skb->data;
1171 th->source = inet->inet_sport;
1172 th->dest = inet->inet_dport;
1173 th->seq = htonl(tcb->seq);
1174 th->ack_seq = htonl(rcv_nxt);
1175 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
1176 tcb->tcp_flags);
1177
1178 th->check = 0;
1179 th->urg_ptr = 0;
1180
1181
1182 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
1183 if (before(tp->snd_up, tcb->seq + 0x10000)) {
1184 th->urg_ptr = htons(tp->snd_up - tcb->seq);
1185 th->urg = 1;
1186 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
1187 th->urg_ptr = htons(0xFFFF);
1188 th->urg = 1;
1189 }
1190 }
1191
1192 tcp_options_write((__be32 *)(th + 1), tp, &opts);
1193 skb_shinfo(skb)->gso_type = sk->sk_gso_type;
1194 if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
1195 th->window = htons(tcp_select_window(sk));
1196 tcp_ecn_send(sk, skb, th, tcp_header_size);
1197 } else {
1198
1199
1200
1201 th->window = htons(min(tp->rcv_wnd, 65535U));
1202 }
1203#ifdef CONFIG_TCP_MD5SIG
1204
1205 if (md5) {
1206 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1207 tp->af_specific->calc_md5_hash(opts.hash_location,
1208 md5, sk, skb);
1209 }
1210#endif
1211
1212 icsk->icsk_af_ops->send_check(sk, skb);
1213
1214 if (likely(tcb->tcp_flags & TCPHDR_ACK))
1215 tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
1216
1217 if (skb->len != tcp_header_size) {
1218 tcp_event_data_sent(tp, sk);
1219 tp->data_segs_out += tcp_skb_pcount(skb);
1220 tp->bytes_sent += skb->len - tcp_header_size;
1221 }
1222
1223 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
1224 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
1225 tcp_skb_pcount(skb));
1226
1227 tp->segs_out += tcp_skb_pcount(skb);
1228
1229 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
1230 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
1231
1232
1233
1234
1235 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
1236 sizeof(struct inet6_skb_parm)));
1237
1238 tcp_add_tx_delay(skb, tp);
1239
1240 err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
1241
1242 if (unlikely(err > 0)) {
1243 tcp_enter_cwr(sk);
1244 err = net_xmit_eval(err);
1245 }
1246 if (!err && oskb) {
1247 tcp_update_skb_after_send(sk, oskb, prior_wstamp);
1248 tcp_rate_skb_sent(sk, oskb);
1249 }
1250 return err;
1251}
1252
1253static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1254 gfp_t gfp_mask)
1255{
1256 return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
1257 tcp_sk(sk)->rcv_nxt);
1258}
1259
1260
1261
1262
1263
1264
1265static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
1266{
1267 struct tcp_sock *tp = tcp_sk(sk);
1268
1269
1270 WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
1271 __skb_header_release(skb);
1272 tcp_add_write_queue_tail(sk, skb);
1273 sk_wmem_queued_add(sk, skb->truesize);
1274 sk_mem_charge(sk, skb->truesize);
1275}
1276
1277
1278static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1279{
1280 if (skb->len <= mss_now) {
1281
1282
1283
1284 tcp_skb_pcount_set(skb, 1);
1285 TCP_SKB_CB(skb)->tcp_gso_size = 0;
1286 } else {
1287 tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1288 TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
1289 }
1290}
1291
1292
1293
1294
1295static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1296{
1297 struct tcp_sock *tp = tcp_sk(sk);
1298
1299 tp->packets_out -= decr;
1300
1301 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1302 tp->sacked_out -= decr;
1303 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1304 tp->retrans_out -= decr;
1305 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1306 tp->lost_out -= decr;
1307
1308
1309 if (tcp_is_reno(tp) && decr > 0)
1310 tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1311
1312 if (tp->lost_skb_hint &&
1313 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1314 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1315 tp->lost_cnt_hint -= decr;
1316
1317 tcp_verify_left_out(tp);
1318}
1319
1320static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
1321{
1322 return TCP_SKB_CB(skb)->txstamp_ack ||
1323 (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
1324}
1325
1326static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
1327{
1328 struct skb_shared_info *shinfo = skb_shinfo(skb);
1329
1330 if (unlikely(tcp_has_tx_tstamp(skb)) &&
1331 !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
1332 struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
1333 u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
1334
1335 shinfo->tx_flags &= ~tsflags;
1336 shinfo2->tx_flags |= tsflags;
1337 swap(shinfo->tskey, shinfo2->tskey);
1338 TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
1339 TCP_SKB_CB(skb)->txstamp_ack = 0;
1340 }
1341}
1342
1343static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
1344{
1345 TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
1346 TCP_SKB_CB(skb)->eor = 0;
1347}
1348
1349
1350static void tcp_insert_write_queue_after(struct sk_buff *skb,
1351 struct sk_buff *buff,
1352 struct sock *sk,
1353 enum tcp_queue tcp_queue)
1354{
1355 if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
1356 __skb_queue_after(&sk->sk_write_queue, skb, buff);
1357 else
1358 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
1359}
1360
1361
1362
1363
1364
1365
1366int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1367 struct sk_buff *skb, u32 len,
1368 unsigned int mss_now, gfp_t gfp)
1369{
1370 struct tcp_sock *tp = tcp_sk(sk);
1371 struct sk_buff *buff;
1372 int nsize, old_factor;
1373 long limit;
1374 int nlen;
1375 u8 flags;
1376
1377 if (WARN_ON(len > skb->len))
1378 return -EINVAL;
1379
1380 nsize = skb_headlen(skb) - len;
1381 if (nsize < 0)
1382 nsize = 0;
1383
1384
1385
1386
1387
1388
1389 limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
1390 if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
1391 tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
1392 skb != tcp_rtx_queue_head(sk) &&
1393 skb != tcp_rtx_queue_tail(sk))) {
1394 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
1395 return -ENOMEM;
1396 }
1397
1398 if (skb_unclone(skb, gfp))
1399 return -ENOMEM;
1400
1401
1402 buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
1403 if (!buff)
1404 return -ENOMEM;
1405 skb_copy_decrypted(buff, skb);
1406
1407 sk_wmem_queued_add(sk, buff->truesize);
1408 sk_mem_charge(sk, buff->truesize);
1409 nlen = skb->len - len - nsize;
1410 buff->truesize += nlen;
1411 skb->truesize -= nlen;
1412
1413
1414 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1415 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1416 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1417
1418
1419 flags = TCP_SKB_CB(skb)->tcp_flags;
1420 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1421 TCP_SKB_CB(buff)->tcp_flags = flags;
1422 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1423 tcp_skb_fragment_eor(skb, buff);
1424
1425 skb_split(skb, buff, len);
1426
1427 buff->ip_summed = CHECKSUM_PARTIAL;
1428
1429 buff->tstamp = skb->tstamp;
1430 tcp_fragment_tstamp(skb, buff);
1431
1432 old_factor = tcp_skb_pcount(skb);
1433
1434
1435 tcp_set_skb_tso_segs(skb, mss_now);
1436 tcp_set_skb_tso_segs(buff, mss_now);
1437
1438
1439 TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
1440
1441
1442
1443
1444 if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1445 int diff = old_factor - tcp_skb_pcount(skb) -
1446 tcp_skb_pcount(buff);
1447
1448 if (diff)
1449 tcp_adjust_pcount(sk, skb, diff);
1450 }
1451
1452
1453 __skb_header_release(buff);
1454 tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1455 if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
1456 list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
1457
1458 return 0;
1459}
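/*
 * Sequence bookkeeping example (illustrative): splitting an skb covering
 * [1000, 4000) at len = 1460 leaves the original skb as [1000, 2460) and
 * the new buff as [2460, 4000); FIN/PSH move to the tail fragment, and
 * pcount is re-derived from each fragment's length so packets_out stays
 * consistent (tcp_adjust_pcount() fixes up any difference for segments
 * already counted as sent).
 */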
1460
1461
1462
1463
1464static int __pskb_trim_head(struct sk_buff *skb, int len)
1465{
1466 struct skb_shared_info *shinfo;
1467 int i, k, eat;
1468
1469 eat = min_t(int, len, skb_headlen(skb));
1470 if (eat) {
1471 __skb_pull(skb, eat);
1472 len -= eat;
1473 if (!len)
1474 return 0;
1475 }
1476 eat = len;
1477 k = 0;
1478 shinfo = skb_shinfo(skb);
1479 for (i = 0; i < shinfo->nr_frags; i++) {
1480 int size = skb_frag_size(&shinfo->frags[i]);
1481
1482 if (size <= eat) {
1483 skb_frag_unref(skb, i);
1484 eat -= size;
1485 } else {
1486 shinfo->frags[k] = shinfo->frags[i];
1487 if (eat) {
1488 skb_frag_off_add(&shinfo->frags[k], eat);
1489 skb_frag_size_sub(&shinfo->frags[k], eat);
1490 eat = 0;
1491 }
1492 k++;
1493 }
1494 }
1495 shinfo->nr_frags = k;
1496
1497 skb->data_len -= len;
1498 skb->len = skb->data_len;
1499 return len;
1500}
1501
1502
1503int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1504{
1505 u32 delta_truesize;
1506
1507 if (skb_unclone(skb, GFP_ATOMIC))
1508 return -ENOMEM;
1509
1510 delta_truesize = __pskb_trim_head(skb, len);
1511
1512 TCP_SKB_CB(skb)->seq += len;
1513 skb->ip_summed = CHECKSUM_PARTIAL;
1514
1515 if (delta_truesize) {
1516 skb->truesize -= delta_truesize;
1517 sk_wmem_queued_add(sk, -delta_truesize);
1518 sk_mem_uncharge(sk, delta_truesize);
1519 sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1520 }
1521
1522
1523 if (tcp_skb_pcount(skb) > 1)
1524 tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
1525
1526 return 0;
1527}
1528

/* Calculate the MSS for a given path MTU, not accounting for TCP options. */
static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* Remove the fixed network and transport headers from the PMTU. */
	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

	/* IPv6 adds a fragment header when the route requires ALLFRAG. */
	if (icsk->icsk_af_ops->net_frag_header_len) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_allfrag(dst))
			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
	}

	/* Honour the peer's advertised MSS clamp (which excludes options). */
	if (mss_now > tp->rx_opt.mss_clamp)
		mss_now = tp->rx_opt.mss_clamp;

	/* Now subtract optional transport overhead (extension headers). */
	mss_now -= icsk->icsk_ext_hdr_len;

	/* Never go below the administrative floor. */
	mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
	return mss_now;
}

/* Calculate the MSS, accounting for the negotiated TCP options
 * (SACK blocks excluded).
 */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	return __tcp_mtu_to_mss(sk, pmtu) -
	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
}

/* Inverse of the above: compute the MTU implied by a given MSS. */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int mtu;

	mtu = mss +
	      tp->tcp_header_len +
	      icsk->icsk_ext_hdr_len +
	      icsk->icsk_af_ops->net_header_len;

	/* IPv6 adds a fragment header when the route requires ALLFRAG. */
	if (icsk->icsk_af_ops->net_frag_header_len) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_allfrag(dst))
			mtu += icsk->icsk_af_ops->net_frag_header_len;
	}
	return mtu;
}
EXPORT_SYMBOL(tcp_mss_to_mtu);
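/*
 * MTU/MSS arithmetic example (IPv4, no IP options, illustrative): an MTU
 * of 1500 gives __tcp_mtu_to_mss() = 1500 - 20 (IP) - 20 (TCP) = 1460.
 * tcp_mtu_to_mss() additionally subtracts the negotiated option overhead;
 * with timestamps, tcp_header_len is 20 + 12 = 32, so the usable payload
 * per segment becomes 1460 - 12 = 1448.  tcp_mss_to_mtu() is the inverse:
 * 1448 + 32 + 0 + 20 = 1500.
 */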
1591
1592
1593void tcp_mtup_init(struct sock *sk)
1594{
1595 struct tcp_sock *tp = tcp_sk(sk);
1596 struct inet_connection_sock *icsk = inet_csk(sk);
1597 struct net *net = sock_net(sk);
1598
1599 icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
1600 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1601 icsk->icsk_af_ops->net_header_len;
1602 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1603 icsk->icsk_mtup.probe_size = 0;
1604 if (icsk->icsk_mtup.enabled)
1605 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
1606}
1607EXPORT_SYMBOL(tcp_mtup_init);
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1632{
1633 struct tcp_sock *tp = tcp_sk(sk);
1634 struct inet_connection_sock *icsk = inet_csk(sk);
1635 int mss_now;
1636
1637 if (icsk->icsk_mtup.search_high > pmtu)
1638 icsk->icsk_mtup.search_high = pmtu;
1639
1640 mss_now = tcp_mtu_to_mss(sk, pmtu);
1641 mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1642
1643
1644 icsk->icsk_pmtu_cookie = pmtu;
1645 if (icsk->icsk_mtup.enabled)
1646 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1647 tp->mss_cache = mss_now;
1648
1649 return mss_now;
1650}
1651EXPORT_SYMBOL(tcp_sync_mss);
1652
1653
1654
1655
1656unsigned int tcp_current_mss(struct sock *sk)
1657{
1658 const struct tcp_sock *tp = tcp_sk(sk);
1659 const struct dst_entry *dst = __sk_dst_get(sk);
1660 u32 mss_now;
1661 unsigned int header_len;
1662 struct tcp_out_options opts;
1663 struct tcp_md5sig_key *md5;
1664
1665 mss_now = tp->mss_cache;
1666
1667 if (dst) {
1668 u32 mtu = dst_mtu(dst);
1669 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1670 mss_now = tcp_sync_mss(sk, mtu);
1671 }
1672
1673 header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1674 sizeof(struct tcphdr);
1675
1676
1677
1678
1679 if (header_len != tp->tcp_header_len) {
1680 int delta = (int) header_len - tp->tcp_header_len;
1681 mss_now -= delta;
1682 }
1683
1684 return mss_now;
1685}
1686
1687
1688
1689
1690
1691static void tcp_cwnd_application_limited(struct sock *sk)
1692{
1693 struct tcp_sock *tp = tcp_sk(sk);
1694
1695 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1696 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1697
1698 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
1699 u32 win_used = max(tp->snd_cwnd_used, init_win);
1700 if (win_used < tp->snd_cwnd) {
1701 tp->snd_ssthresh = tcp_current_ssthresh(sk);
1702 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
1703 }
1704 tp->snd_cwnd_used = 0;
1705 }
1706 tp->snd_cwnd_stamp = tcp_jiffies32;
1707}
1708
1709static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1710{
1711 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1712 struct tcp_sock *tp = tcp_sk(sk);
1713
1714
1715
1716
1717 if (!before(tp->snd_una, tp->max_packets_seq) ||
1718 tp->packets_out > tp->max_packets_out) {
1719 tp->max_packets_out = tp->packets_out;
1720 tp->max_packets_seq = tp->snd_nxt;
1721 tp->is_cwnd_limited = is_cwnd_limited;
1722 }
1723
1724 if (tcp_is_cwnd_limited(sk)) {
1725
1726 tp->snd_cwnd_used = 0;
1727 tp->snd_cwnd_stamp = tcp_jiffies32;
1728 } else {
1729
1730 if (tp->packets_out > tp->snd_cwnd_used)
1731 tp->snd_cwnd_used = tp->packets_out;
1732
1733 if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
1734 (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
1735 !ca_ops->cong_control)
1736 tcp_cwnd_application_limited(sk);
1737
1738
1739
1740
1741
1742
1743
1744
1745 if (tcp_write_queue_empty(sk) && sk->sk_socket &&
1746 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1747 (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1748 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
1749 }
1750}
1751
1752
1753static bool tcp_minshall_check(const struct tcp_sock *tp)
1754{
1755 return after(tp->snd_sml, tp->snd_una) &&
1756 !after(tp->snd_sml, tp->snd_nxt);
1757}
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
1768 const struct sk_buff *skb)
1769{
1770 if (skb->len < tcp_skb_pcount(skb) * mss_now)
1771 tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1772}
1773
1774
1775
1776
1777
1778
1779
1780
1781static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1782 int nonagle)
1783{
1784 return partial &&
1785 ((nonagle & TCP_NAGLE_CORK) ||
1786 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1787}
1788

/* Return how many segments we want per TSO skb.  The aim is to send at
 * most about one pacing interval worth of data per burst.
 */
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
			    int min_tso_segs)
{
	u32 bytes, segs;

	bytes = min_t(unsigned long,
		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
		      sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);

	/* Never go below min_tso_segs, even at very low pacing rates. */
	segs = max_t(u32, bytes / mss_now, min_tso_segs);

	return segs;
}
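/*
 * Autosizing example (illustrative): with sk_pacing_rate = 125,000,000 B/s
 * (~1 Gbit/s) and the default sk_pacing_shift of 10, one burst is about
 * 125e6 >> 10 ~= 122 KB, i.e. roughly 1 ms worth of data.  At an MSS of
 * 1460 that is ~83 segments per TSO skb, later clamped to sk_gso_max_segs
 * in tcp_tso_segs().
 */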
1810
1811
1812
1813
1814static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
1815{
1816 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1817 u32 min_tso, tso_segs;
1818
1819 min_tso = ca_ops->min_tso_segs ?
1820 ca_ops->min_tso_segs(sk) :
1821 sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
1822
1823 tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
1824 return min_t(u32, tso_segs, sk->sk_gso_max_segs);
1825}
1826
1827
1828static unsigned int tcp_mss_split_point(const struct sock *sk,
1829 const struct sk_buff *skb,
1830 unsigned int mss_now,
1831 unsigned int max_segs,
1832 int nonagle)
1833{
1834 const struct tcp_sock *tp = tcp_sk(sk);
1835 u32 partial, needed, window, max_len;
1836
1837 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1838 max_len = mss_now * max_segs;
1839
1840 if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
1841 return max_len;
1842
1843 needed = min(skb->len, window);
1844
1845 if (max_len <= needed)
1846 return max_len;
1847
1848 partial = needed % mss_now;
1849
1850
1851
1852
1853 if (tcp_nagle_check(partial != 0, tp, nonagle))
1854 return needed - partial;
1855
1856 return needed;
1857}
1858

/* How many segments of @skb may be sent right now according to the
 * congestion window?  Returns 0 when the window is full.
 */
static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
					 const struct sk_buff *skb)
{
	u32 in_flight, cwnd, halfcwnd;

	/* Don't be strict about the congestion window for the final FIN. */
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
	    tcp_skb_pcount(skb) == 1)
		return 1;

	in_flight = tcp_packets_in_flight(tp);
	cwnd = tp->snd_cwnd;
	if (in_flight >= cwnd)
		return 0;

	/* For better scheduling, never hand out more than half the window
	 * in one go.
	 */
	halfcwnd = max(cwnd >> 1, 1U);
	return min(halfcwnd, cwnd - in_flight);
}
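/*
 * Example (illustrative): with snd_cwnd = 20 and 17 packets in flight the
 * quota is min(cwnd/2, cwnd - in_flight) = min(10, 3) = 3 segments, so a
 * large TSO skb will be split rather than overshooting the window; a lone
 * FIN segment is always allowed through.
 */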
1883
1884
1885
1886
1887
1888static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1889{
1890 int tso_segs = tcp_skb_pcount(skb);
1891
1892 if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
1893 tcp_set_skb_tso_segs(skb, mss_now);
1894 tso_segs = tcp_skb_pcount(skb);
1895 }
1896 return tso_segs;
1897}
1898
1899
1900
1901
1902
1903static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1904 unsigned int cur_mss, int nonagle)
1905{
1906
1907
1908
1909
1910
1911
1912 if (nonagle & TCP_NAGLE_PUSH)
1913 return true;
1914
1915
1916 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1917 return true;
1918
1919 if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
1920 return true;
1921
1922 return false;
1923}
1924
1925
1926static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1927 const struct sk_buff *skb,
1928 unsigned int cur_mss)
1929{
1930 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1931
1932 if (skb->len > cur_mss)
1933 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
1934
1935 return !after(end_seq, tcp_wnd_end(tp));
1936}
1937
1938
1939
1940
1941
1942
1943
1944
1945static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1946 unsigned int mss_now, gfp_t gfp)
1947{
1948 int nlen = skb->len - len;
1949 struct sk_buff *buff;
1950 u8 flags;
1951
1952
1953 if (skb->len != skb->data_len)
1954 return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
1955 skb, len, mss_now, gfp);
1956
1957 buff = sk_stream_alloc_skb(sk, 0, gfp, true);
1958 if (unlikely(!buff))
1959 return -ENOMEM;
1960 skb_copy_decrypted(buff, skb);
1961
1962 sk_wmem_queued_add(sk, buff->truesize);
1963 sk_mem_charge(sk, buff->truesize);
1964 buff->truesize += nlen;
1965 skb->truesize -= nlen;
1966
1967
1968 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1969 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1970 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1971
1972
1973 flags = TCP_SKB_CB(skb)->tcp_flags;
1974 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1975 TCP_SKB_CB(buff)->tcp_flags = flags;
1976
1977
1978 TCP_SKB_CB(buff)->sacked = 0;
1979
1980 tcp_skb_fragment_eor(skb, buff);
1981
1982 buff->ip_summed = CHECKSUM_PARTIAL;
1983 skb_split(skb, buff, len);
1984 tcp_fragment_tstamp(skb, buff);
1985
1986
1987 tcp_set_skb_tso_segs(skb, mss_now);
1988 tcp_set_skb_tso_segs(buff, mss_now);
1989
1990
1991 __skb_header_release(buff);
1992 tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
1993
1994 return 0;
1995}
1996
1997
1998
1999
2000
2001
2002static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
2003 bool *is_cwnd_limited,
2004 bool *is_rwnd_limited,
2005 u32 max_segs)
2006{
2007 const struct inet_connection_sock *icsk = inet_csk(sk);
2008 u32 send_win, cong_win, limit, in_flight;
2009 struct tcp_sock *tp = tcp_sk(sk);
2010 struct sk_buff *head;
2011 int win_divisor;
2012 s64 delta;
2013
2014 if (icsk->icsk_ca_state >= TCP_CA_Recovery)
2015 goto send_now;
2016
2017
2018
2019
2020
2021
2022 delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
2023 if (delta > 0)
2024 goto send_now;
2025
2026 in_flight = tcp_packets_in_flight(tp);
2027
2028 BUG_ON(tcp_skb_pcount(skb) <= 1);
2029 BUG_ON(tp->snd_cwnd <= in_flight);
2030
2031 send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2032
2033
2034 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
2035
2036 limit = min(send_win, cong_win);
2037
2038
2039 if (limit >= max_segs * tp->mss_cache)
2040 goto send_now;
2041
2042
2043 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
2044 goto send_now;
2045
2046 win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
2047 if (win_divisor) {
2048 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
2049
2050
2051
2052
2053 chunk /= win_divisor;
2054 if (limit >= chunk)
2055 goto send_now;
2056 } else {
2057
2058
2059
2060
2061
2062 if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
2063 goto send_now;
2064 }
2065
2066
2067 head = tcp_rtx_queue_head(sk);
2068 if (!head)
2069 goto send_now;
2070 delta = tp->tcp_clock_cache - head->tstamp;
2071
2072 if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
2073 goto send_now;
2074
2075
2076
2077
2078
2079
2080
2081 if (cong_win < send_win) {
2082 if (cong_win <= skb->len) {
2083 *is_cwnd_limited = true;
2084 return true;
2085 }
2086 } else {
2087 if (send_win <= skb->len) {
2088 *is_rwnd_limited = true;
2089 return true;
2090 }
2091 }
2092
2093
2094 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
2095 TCP_SKB_CB(skb)->eor)
2096 goto send_now;
2097
2098 return true;
2099
2100send_now:
2101 return false;
2102}
2103
2104static inline void tcp_mtu_check_reprobe(struct sock *sk)
2105{
2106 struct inet_connection_sock *icsk = inet_csk(sk);
2107 struct tcp_sock *tp = tcp_sk(sk);
2108 struct net *net = sock_net(sk);
2109 u32 interval;
2110 s32 delta;
2111
2112 interval = net->ipv4.sysctl_tcp_probe_interval;
2113 delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
2114 if (unlikely(delta >= interval * HZ)) {
2115 int mss = tcp_current_mss(sk);
2116
2117
2118 icsk->icsk_mtup.probe_size = 0;
2119 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
2120 sizeof(struct tcphdr) +
2121 icsk->icsk_af_ops->net_header_len;
2122 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
2123
2124
2125 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
2126 }
2127}
2128
2129static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
2130{
2131 struct sk_buff *skb, *next;
2132
2133 skb = tcp_send_head(sk);
2134 tcp_for_write_queue_from_safe(skb, next, sk) {
2135 if (len <= skb->len)
2136 break;
2137
2138 if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
2139 return false;
2140
2141 len -= skb->len;
2142 }
2143
2144 return true;
2145}
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156static int tcp_mtu_probe(struct sock *sk)
2157{
2158 struct inet_connection_sock *icsk = inet_csk(sk);
2159 struct tcp_sock *tp = tcp_sk(sk);
2160 struct sk_buff *skb, *nskb, *next;
2161 struct net *net = sock_net(sk);
2162 int probe_size;
2163 int size_needed;
2164 int copy, len;
2165 int mss_now;
2166 int interval;
2167
2168
2169
2170
2171
2172
2173 if (likely(!icsk->icsk_mtup.enabled ||
2174 icsk->icsk_mtup.probe_size ||
2175 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
2176 tp->snd_cwnd < 11 ||
2177 tp->rx_opt.num_sacks || tp->rx_opt.dsack))
2178 return -1;
2179
2180
2181
2182
2183
2184 mss_now = tcp_current_mss(sk);
2185 probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
2186 icsk->icsk_mtup.search_low) >> 1);
2187 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
2188 interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
2189
2190
2191
2192
2193 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
2194 interval < net->ipv4.sysctl_tcp_probe_threshold) {
2195
2196
2197
2198 tcp_mtu_check_reprobe(sk);
2199 return -1;
2200 }
2201
2202
2203 if (tp->write_seq - tp->snd_nxt < size_needed)
2204 return -1;
2205
2206 if (tp->snd_wnd < size_needed)
2207 return -1;
2208 if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
2209 return 0;
2210
2211
2212 if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
2213 if (!tcp_packets_in_flight(tp))
2214 return -1;
2215 else
2216 return 0;
2217 }
2218
2219 if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
2220 return -1;
2221
2222
2223 nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
2224 if (!nskb)
2225 return -1;
2226 sk_wmem_queued_add(sk, nskb->truesize);
2227 sk_mem_charge(sk, nskb->truesize);
2228
2229 skb = tcp_send_head(sk);
2230 skb_copy_decrypted(nskb, skb);
2231
2232 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
2233 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
2234 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
2235 TCP_SKB_CB(nskb)->sacked = 0;
2236 nskb->csum = 0;
2237 nskb->ip_summed = CHECKSUM_PARTIAL;
2238
2239 tcp_insert_write_queue_before(nskb, skb, sk);
2240 tcp_highest_sack_replace(sk, skb, nskb);
2241
2242 len = 0;
2243 tcp_for_write_queue_from_safe(skb, next, sk) {
2244 copy = min_t(int, skb->len, probe_size - len);
2245 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
2246
2247 if (skb->len <= copy) {
2248
2249
2250 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2251
2252
2253
2254 TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
2255 tcp_skb_collapse_tstamp(nskb, skb);
2256 tcp_unlink_write_queue(skb, sk);
2257 sk_wmem_free_skb(sk, skb);
2258 } else {
2259 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
2260 ~(TCPHDR_FIN|TCPHDR_PSH);
2261 if (!skb_shinfo(skb)->nr_frags) {
2262 skb_pull(skb, copy);
2263 } else {
2264 __pskb_trim_head(skb, copy);
2265 tcp_set_skb_tso_segs(skb, mss_now);
2266 }
2267 TCP_SKB_CB(skb)->seq += copy;
2268 }
2269
2270 len += copy;
2271
2272 if (len >= probe_size)
2273 break;
2274 }
2275 tcp_init_tso_segs(nskb, nskb->len);
2276
2277
2278
2279
2280 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
2281
2282
2283 tp->snd_cwnd--;
2284 tcp_event_new_data_sent(sk, nskb);
2285
2286 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
2287 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
2288 tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
2289
2290 return 1;
2291 }
2292
2293 return -1;
2294}
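/*
 * Probe sizing sketch (illustrative numbers): MTU probing bisects between
 * icsk_mtup.search_low and search_high; with low = 1100 and high = 1500
 * the probe built here carries tcp_mtu_to_mss(sk, 1300) bytes of
 * already-queued data coalesced into a single skb.  When the probe is
 * ACKed, code elsewhere in the stack raises search_low; if it is lost,
 * search_high is lowered, and the search stops once the interval drops
 * below sysctl_tcp_probe_threshold (checked above).
 */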
2295
2296static bool tcp_pacing_check(struct sock *sk)
2297{
2298 struct tcp_sock *tp = tcp_sk(sk);
2299
2300 if (!tcp_needs_internal_pacing(sk))
2301 return false;
2302
2303 if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2304 return false;
2305
2306 if (!hrtimer_is_queued(&tp->pacing_timer)) {
2307 hrtimer_start(&tp->pacing_timer,
2308 ns_to_ktime(tp->tcp_wstamp_ns),
2309 HRTIMER_MODE_ABS_PINNED_SOFT);
2310 sock_hold(sk);
2311 }
2312 return true;
2313}
2314

/* TCP Small Queues (TSQ): limit how much data a flow may have queued in
 * the qdisc/device layers to roughly one pacing interval worth of bytes
 * (and at least two skbs), so feedback from lower layers stays fresh and
 * in-host bufferbloat is avoided.  Returns true if the caller should stop
 * transmitting for now; TSQ_THROTTLED is then set and the skb destructor
 * (tcp_wfree()) reschedules transmission once queued bytes drain.
 */
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
				  unsigned int factor)
{
	unsigned long limit;

	limit = max_t(unsigned long,
		      2 * skb->truesize,
		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
	if (sk->sk_pacing_status == SK_PACING_NONE)
		limit = min_t(unsigned long, limit,
			      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
	limit <<= factor;

	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
	    tcp_sk(sk)->tcp_tx_delay) {
		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;

		/* tcp_tx_delay is in usec; shifting by 19 instead of
		 * dividing by 10^6 roughly converts to seconds while also
		 * doubling the estimate to account for skb truesize overhead.
		 */
		extra_bytes >>= (20 - 1);
		limit += extra_bytes;
	}
	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
		/* Always allow the skb if the rtx queue is empty: there is
		 * no point waiting for a TX completion that may be delayed.
		 */
		if (tcp_rtx_queue_empty(sk))
			return false;

		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		/* TX completion may already have happened before we set
		 * TSQ_THROTTLED, so re-test the condition after the barrier.
		 */
		smp_mb__after_atomic();
		if (refcount_read(&sk->sk_wmem_alloc) > limit)
			return true;
	}
	return false;
}
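/*
 * Limit example (illustrative): with sk_pacing_rate = 12,500,000 B/s and
 * the default pacing shift of 10, the base limit is ~12 KB, but never less
 * than two skb truesizes; without pacing it is further capped by
 * sysctl_tcp_limit_output_bytes.  tcp_write_xmit() calls this with
 * factor = 0, while other callers (e.g. the retransmit path) may pass a
 * larger factor to loosen the bound.
 */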
2371
2372static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
2373{
2374 const u32 now = tcp_jiffies32;
2375 enum tcp_chrono old = tp->chrono_type;
2376
2377 if (old > TCP_CHRONO_UNSPEC)
2378 tp->chrono_stat[old - 1] += now - tp->chrono_start;
2379 tp->chrono_start = now;
2380 tp->chrono_type = new;
2381}
2382
2383void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
2384{
2385 struct tcp_sock *tp = tcp_sk(sk);
2386
2387
2388
2389
2390
2391
2392 if (type > tp->chrono_type)
2393 tcp_chrono_set(tp, type);
2394}
2395
2396void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
2397{
2398 struct tcp_sock *tp = tcp_sk(sk);
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408 if (tcp_rtx_and_write_queues_empty(sk))
2409 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
2410 else if (type == tp->chrono_type)
2411 tcp_chrono_set(tp, TCP_CHRONO_BUSY);
2412}
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2429 int push_one, gfp_t gfp)
2430{
2431 struct tcp_sock *tp = tcp_sk(sk);
2432 struct sk_buff *skb;
2433 unsigned int tso_segs, sent_pkts;
2434 int cwnd_quota;
2435 int result;
2436 bool is_cwnd_limited = false, is_rwnd_limited = false;
2437 u32 max_segs;
2438
2439 sent_pkts = 0;
2440
2441 tcp_mstamp_refresh(tp);
2442 if (!push_one) {
2443
2444 result = tcp_mtu_probe(sk);
2445 if (!result) {
2446 return false;
2447 } else if (result > 0) {
2448 sent_pkts = 1;
2449 }
2450 }
2451
2452 max_segs = tcp_tso_segs(sk, mss_now);
2453 while ((skb = tcp_send_head(sk))) {
2454 unsigned int limit;
2455
2456 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2457
2458 skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2459 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
2460 tcp_init_tso_segs(skb, mss_now);
2461 goto repair;
2462 }
2463
2464 if (tcp_pacing_check(sk))
2465 break;
2466
2467 tso_segs = tcp_init_tso_segs(skb, mss_now);
2468 BUG_ON(!tso_segs);
2469
2470 cwnd_quota = tcp_cwnd_test(tp, skb);
2471 if (!cwnd_quota) {
2472 if (push_one == 2)
2473
2474 cwnd_quota = 1;
2475 else
2476 break;
2477 }
2478
2479 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
2480 is_rwnd_limited = true;
2481 break;
2482 }
2483
2484 if (tso_segs == 1) {
2485 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
2486 (tcp_skb_is_last(sk, skb) ?
2487 nonagle : TCP_NAGLE_PUSH))))
2488 break;
2489 } else {
2490 if (!push_one &&
2491 tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
2492 &is_rwnd_limited, max_segs))
2493 break;
2494 }
2495
2496 limit = mss_now;
2497 if (tso_segs > 1 && !tcp_urg_mode(tp))
2498 limit = tcp_mss_split_point(sk, skb, mss_now,
2499 min_t(unsigned int,
2500 cwnd_quota,
2501 max_segs),
2502 nonagle);
2503
2504 if (skb->len > limit &&
2505 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2506 break;
2507
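 /* TCP Small Queues: refrain from queueing more bytes into the
  * qdisc/device if too much of this flow is already sitting there.
  */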
2508 if (tcp_small_queue_check(sk, skb, 0))
2509 break;
2510
 /* Argh, we hit an empty skb(), presumably a thread
  * is sleeping in sendmsg()/sk_stream_wait_memory().
  * We do not want to send a pure-ack packet and have
  * a strange looking rtx queue with empty packet(s).
  */
2516 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
2517 break;
2518
2519 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2520 break;
2521
2522repair:
 /* Advance the send_head.  This one is sent out.
  * This call will increment packets_out.
  */
2526 tcp_event_new_data_sent(sk, skb);
2527
2528 tcp_minshall_update(tp, mss_now, skb);
2529 sent_pkts += tcp_skb_pcount(skb);
2530
2531 if (push_one)
2532 break;
2533 }
2534
2535 if (is_rwnd_limited)
2536 tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
2537 else
2538 tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
2539
2540 if (likely(sent_pkts)) {
2541 if (tcp_in_cwnd_reduction(sk))
2542 tp->prr_out += sent_pkts;
2543
 /* Send one loss probe per tail loss episode. */
2545 if (push_one != 2)
2546 tcp_schedule_loss_probe(sk, false);
2547 is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
2548 tcp_cwnd_validate(sk, is_cwnd_limited);
2549 return false;
2550 }
2551 return !tp->packets_out && !tcp_write_queue_empty(sk);
2552}
2553
2554bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
2555{
2556 struct inet_connection_sock *icsk = inet_csk(sk);
2557 struct tcp_sock *tp = tcp_sk(sk);
2558 u32 timeout, rto_delta_us;
2559 int early_retrans;
2560
 /* Don't do any loss probe on a Fast Open connection before 3WHS
  * finishes.
  */
2564 if (rcu_access_pointer(tp->fastopen_rsk))
2565 return false;
2566
2567 early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
2568
 /* Schedule a loss probe in 2*RTT for SACK capable connections
  * not in loss recovery, that are either limited by cwnd or application.
  */
2571 if ((early_retrans != 3 && early_retrans != 4) ||
2572 !tp->packets_out || !tcp_is_sack(tp) ||
2573 (icsk->icsk_ca_state != TCP_CA_Open &&
2574 icsk->icsk_ca_state != TCP_CA_CWR))
2575 return false;
2576
 /* Probe timeout is 2*rtt. Add minimum RTO to account
  * for delayed ack when there's one outstanding packet. If no RTT
  * sample is available then probe after TCP_TIMEOUT_INIT.
  */
2581 if (tp->srtt_us) {
2582 timeout = usecs_to_jiffies(tp->srtt_us >> 2);
2583 if (tp->packets_out == 1)
2584 timeout += TCP_RTO_MIN;
2585 else
2586 timeout += TCP_TIMEOUT_MIN;
2587 } else {
2588 timeout = TCP_TIMEOUT_INIT;
2589 }
2590
 /* If the RTO formula yields an earlier time, then use that time. */
2592 rto_delta_us = advancing_rto ?
2593 jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
2594 tcp_rto_delta_us(sk);
2595 if (rto_delta_us > 0)
2596 timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
2597
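 /* Arm the probe timeout (PTO); tcp_send_loss_probe() runs if it fires
  * before the outstanding data is acknowledged.
  */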
2598 tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
2599 return true;
2600}
2601
/* Thanks to skb fast clones, we can detect if a prior transmit of
 * a packet is still in a qdisc or driver queue.
 * In this case, there is very little point doing a retransmit !
 */
2606static bool skb_still_in_host_queue(const struct sock *sk,
2607 const struct sk_buff *skb)
2608{
2609 if (unlikely(skb_fclone_busy(sk, skb))) {
2610 NET_INC_STATS(sock_net(sk),
2611 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2612 return true;
2613 }
2614 return false;
2615}
2616
/* When probe timeout (PTO) fires, try send a new segment if possible, else
 * retransmit the last segment.
 */
2620void tcp_send_loss_probe(struct sock *sk)
2621{
2622 struct tcp_sock *tp = tcp_sk(sk);
2623 struct sk_buff *skb;
2624 int pcount;
2625 int mss = tcp_current_mss(sk);
2626
 /* At most one outstanding TLP retransmission. */
2628 if (tp->tlp_high_seq)
2629 goto rearm_timer;
2630
2631 tp->tlp_retrans = 0;
2632 skb = tcp_send_head(sk);
2633 if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
2634 pcount = tp->packets_out;
2635 tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2636 if (tp->packets_out > pcount)
2637 goto probe_sent;
2638 goto rearm_timer;
2639 }
2640 skb = skb_rb_last(&sk->tcp_rtx_queue);
2641 if (unlikely(!skb)) {
2642 WARN_ONCE(tp->packets_out,
2643 "invalid inflight: %u state %u cwnd %u mss %d\n",
2644 tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
2645 inet_csk(sk)->icsk_pending = 0;
2646 return;
2647 }
2648
2649 if (skb_still_in_host_queue(sk, skb))
2650 goto rearm_timer;
2651
2652 pcount = tcp_skb_pcount(skb);
2653 if (WARN_ON(!pcount))
2654 goto rearm_timer;
2655
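 /* If the tail skb covers several segments, probe with only its last
  * mss worth of data: split off the first (pcount - 1) * mss bytes and
  * retransmit the remaining piece.
  */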
2656 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2657 if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2658 (pcount - 1) * mss, mss,
2659 GFP_ATOMIC)))
2660 goto rearm_timer;
2661 skb = skb_rb_next(skb);
2662 }
2663
2664 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
2665 goto rearm_timer;
2666
2667 if (__tcp_retransmit_skb(sk, skb, 1))
2668 goto rearm_timer;
2669
2670 tp->tlp_retrans = 1;
2671
2672probe_sent:
 /* Record snd_nxt for loss detection. */
2674 tp->tlp_high_seq = tp->snd_nxt;
2675
2676 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
 /* Reset s.t. tcp_rearm_rto will restart timer from now */
2678 inet_csk(sk)->icsk_pending = 0;
2679rearm_timer:
2680 tcp_rearm_rto(sk);
2681}
2682
/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 */
2687void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2688 int nonagle)
2689{
 /* If we are closed, the bytes will have to remain here.
  * In time closedown will finish, we empty the write queue and
  * all will be happy.
  */
2694 if (unlikely(sk->sk_state == TCP_CLOSE))
2695 return;
2696
2697 if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2698 sk_gfp_mask(sk, GFP_ATOMIC)))
2699 tcp_check_probe_timer(sk);
2700}
2701
/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 */
2705void tcp_push_one(struct sock *sk, unsigned int mss_now)
2706{
2707 struct sk_buff *skb = tcp_send_head(sk);
2708
2709 BUG_ON(!skb || skb->len < mss_now);
2710
2711 tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
2712}
2713
/* This function returns the amount that we can raise the
 * usable window based on the following constraints:
 *
 * 1. The window can never be shrunk once it is offered (RFC 793).
 * 2. We limit memory per socket.
 *
 * RFC 1122 recommends receiver side SWS avoidance: do not raise the
 * right edge of the window until it can be raised by at least one MSS
 * (min(1/2 RCV.BUFF, MSS)).  Literally keeping th->window fixed would
 * break header prediction, so below we obtain similar behaviour by
 * forcing the offered window to a multiple of the mss when it is
 * feasible to do so, and by announcing a zero window once free space
 * drops too low.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 */
2766u32 __tcp_select_window(struct sock *sk)
2767{
2768 struct inet_connection_sock *icsk = inet_csk(sk);
2769 struct tcp_sock *tp = tcp_sk(sk);
2770
 /* MSS for the peer's data.  Previous versions used mss_clamp
  * here.  I don't know if the value based on our guesses
  * of peer's MSS is better for the performance.  It's more correct
  * but may be worse for the performance because of rcv_mss
  * fuzziness.  It won't matter really.
  */
2776 int mss = icsk->icsk_ack.rcv_mss;
2777 int free_space = tcp_space(sk);
2778 int allowed_space = tcp_full_space(sk);
2779 int full_space, window;
2780
2781 if (sk_is_mptcp(sk))
2782 mptcp_space(sk, &free_space, &allowed_space);
2783
2784 full_space = min_t(int, tp->window_clamp, allowed_space);
2785
2786 if (unlikely(mss > full_space)) {
2787 mss = full_space;
2788 if (mss <= 0)
2789 return 0;
2790 }
2791 if (free_space < (full_space >> 1)) {
2792 icsk->icsk_ack.quick = 0;
2793
2794 if (tcp_under_memory_pressure(sk))
2795 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2796 4U * tp->advmss);
2797
 /* free_space might become our new window, make sure we don't
  * increase it due to wscale.
  */
2801 free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
2802
 /* if free space is less than mss estimate, or is below 1/16th
  * of the maximum allowed, try to move to zero-window, else
  * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
  * new incoming data is dropped due to memory limits.
  * With large window, mss test triggers way too late in order
  * to announce zero window in time before rmem limit kicks in.
  */
2810 if (free_space < (allowed_space >> 4) || free_space < mss)
2811 return 0;
2812 }
2813
2814 if (free_space > tp->rcv_ssthresh)
2815 free_space = tp->rcv_ssthresh;
2816
 /* Don't do rounding if we are using window scaling, since the
  * scaled window will not line up with the MSS boundary anyway.
  */
2820 if (tp->rx_opt.rcv_wscale) {
2821 window = free_space;
 /* Advertise enough space so that it won't get scaled away.
  * Important case: prevent zero window announcement if
  * 1<<rcv_wscale > mss.
  */
2827 window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
2828 } else {
2829 window = tp->rcv_wnd;
 /* Get the largest window that is a nice multiple of mss.
  * Window clamp already applied above.
  * If our current window offering is within 1 mss of the
  * free space we just keep it. This prevents the divide
  * and multiply from happening most of the time.
  * We also don't do any window rounding when the free space
  * is too small.
  */
2838 if (window <= free_space - mss || window > free_space)
2839 window = rounddown(free_space, mss);
2840 else if (mss == full_space &&
2841 free_space > window + (full_space >> 1))
2842 window = free_space;
2843 }
2844
2845 return window;
2846}
2847
2848void tcp_skb_collapse_tstamp(struct sk_buff *skb,
2849 const struct sk_buff *next_skb)
2850{
2851 if (unlikely(tcp_has_tx_tstamp(next_skb))) {
2852 const struct skb_shared_info *next_shinfo =
2853 skb_shinfo(next_skb);
2854 struct skb_shared_info *shinfo = skb_shinfo(skb);
2855
2856 shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
2857 shinfo->tskey = next_shinfo->tskey;
2858 TCP_SKB_CB(skb)->txstamp_ack |=
2859 TCP_SKB_CB(next_skb)->txstamp_ack;
2860 }
2861}
2862
/* Collapses two adjacent SKB's during retransmission. */
2864static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2865{
2866 struct tcp_sock *tp = tcp_sk(sk);
2867 struct sk_buff *next_skb = skb_rb_next(skb);
2868 int next_skb_size;
2869
2870 next_skb_size = next_skb->len;
2871
2872 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
2873
2874 if (next_skb_size) {
2875 if (next_skb_size <= skb_availroom(skb))
2876 skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
2877 next_skb_size);
2878 else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
2879 return false;
2880 }
2881 tcp_highest_sack_replace(sk, next_skb, skb);
2882
 /* Update sequence range on original skb. */
2884 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
2885
 /* Merge over control information. This moves PSH/FIN etc. over */
2887 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
2888
 /* All done, get rid of second SKB and account for it so
  * packet counting does not break.
  */
2892 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
2893 TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
2894
 /* changed transmit queue under us so clear hints */
2896 tcp_clear_retrans_hints_partial(tp);
2897 if (next_skb == tp->retransmit_skb_hint)
2898 tp->retransmit_skb_hint = skb;
2899
2900 tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
2901
2902 tcp_skb_collapse_tstamp(skb, next_skb);
2903
2904 tcp_rtx_queue_unlink_and_free(next_skb, sk);
2905 return true;
2906}
2907
/* Check if coalescing SKBs is legal. */
2909static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2910{
2911 if (tcp_skb_pcount(skb) > 1)
2912 return false;
2913 if (skb_cloned(skb))
2914 return false;
 /* Some heuristics for collapsing over SACK'd could be invented */
2916 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2917 return false;
2918
2919 return true;
2920}
2921
/* Collapse packets in the retransmit queue to create fewer packets
 * on the wire. This is only done on retransmission.
 */
2925static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2926 int space)
2927{
2928 struct tcp_sock *tp = tcp_sk(sk);
2929 struct sk_buff *skb = to, *tmp;
2930 bool first = true;
2931
2932 if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
2933 return;
2934 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2935 return;
2936
2937 skb_rbtree_walk_from_safe(skb, tmp) {
2938 if (!tcp_can_collapse(sk, skb))
2939 break;
2940
2941 if (!tcp_skb_can_collapse(to, skb))
2942 break;
2943
2944 space -= skb->len;
2945
2946 if (first) {
2947 first = false;
2948 continue;
2949 }
2950
2951 if (space < 0)
2952 break;
2953
2954 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
2955 break;
2956
2957 if (!tcp_collapse_retrans(sk, to))
2958 break;
2959 }
2960}
2961
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
2966int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2967{
2968 struct inet_connection_sock *icsk = inet_csk(sk);
2969 struct tcp_sock *tp = tcp_sk(sk);
2970 unsigned int cur_mss;
2971 int diff, len, err;
2972
2973
 /* Inconclusive MTU probe */
2975 if (icsk->icsk_mtup.probe_size)
2976 icsk->icsk_mtup.probe_size = 0;
2977
 /* Do not send more than we queued. 1/4 is reserved for possible
  * copying overhead: fragmentation, tunneling, mangling etc.
  */
2981 if (refcount_read(&sk->sk_wmem_alloc) >
2982 min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
2983 sk->sk_sndbuf))
2984 return -EAGAIN;
2985
2986 if (skb_still_in_host_queue(sk, skb))
2987 return -EBUSY;
2988
2989 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
2990 if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
2991 WARN_ON_ONCE(1);
2992 return -EINVAL;
2993 }
2994 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
2995 return -ENOMEM;
2996 }
2997
2998 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
2999 return -EHOSTUNREACH;
3000
3001 cur_mss = tcp_current_mss(sk);
3002
 /* If receiver has shrunk his window, and skb is out of
  * new window, do not retransmit it. The exception is the
  * case, when window is shrunk to zero. In this case
  * our retransmit serves as a zero window probe.
  */
3008 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
3009 TCP_SKB_CB(skb)->seq != tp->snd_una)
3010 return -EAGAIN;
3011
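 /* Retransmit at most @segs segments at the current MSS: either split
  * the skb down to that length, or update its TSO segment count (and
  * possibly collapse follow-up skbs) if it already fits.
  */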
3012 len = cur_mss * segs;
3013 if (skb->len > len) {
3014 if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
3015 cur_mss, GFP_ATOMIC))
3016 return -ENOMEM;
3017 } else {
3018 if (skb_unclone(skb, GFP_ATOMIC))
3019 return -ENOMEM;
3020
3021 diff = tcp_skb_pcount(skb);
3022 tcp_set_skb_tso_segs(skb, cur_mss);
3023 diff -= tcp_skb_pcount(skb);
3024 if (diff)
3025 tcp_adjust_pcount(sk, skb, diff);
3026 if (skb->len < cur_mss)
3027 tcp_retrans_try_collapse(sk, skb, cur_mss);
3028 }
3029
 /* RFC3168, section 6.1.1.1. ECN fallback */
3031 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
3032 tcp_ecn_clear_syn(sk, skb);
3033
 /* Update global and local TCP statistics. */
3035 segs = tcp_skb_pcount(skb);
3036 TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
3037 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3038 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3039 tp->total_retrans += segs;
3040 tp->bytes_retrans += skb->len;
3041
 /* make sure skb->data is aligned on arches that require it
  * and check if ack-trimming & collapsing extended the headroom
  * beyond what csum_start can cover.
  */
3046 if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
3047 skb_headroom(skb) >= 0xFFFF)) {
3048 struct sk_buff *nskb;
3049
3050 tcp_skb_tsorted_save(skb) {
3051 nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
3052 if (nskb) {
3053 nskb->dev = NULL;
3054 err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
3055 } else {
3056 err = -ENOBUFS;
3057 }
3058 } tcp_skb_tsorted_restore(skb);
3059
3060 if (!err) {
3061 tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
3062 tcp_rate_skb_sent(sk, skb);
3063 }
3064 } else {
3065 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3066 }
3067
 /* To avoid taking spuriously low RTT samples based on a timestamp
  * for a transmit that never happened, always mark EVER_RETRANS
  */
3071 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
3072
3073 if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
3074 tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
3075 TCP_SKB_CB(skb)->seq, segs, err);
3076
3077 if (likely(!err)) {
3078 trace_tcp_retransmit_skb(sk, skb);
3079 } else if (err != -EBUSY) {
3080 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
3081 }
3082 return err;
3083}
3084
3085int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
3086{
3087 struct tcp_sock *tp = tcp_sk(sk);
3088 int err = __tcp_retransmit_skb(sk, skb, segs);
3089
3090 if (err == 0) {
3091#if FASTRETRANS_DEBUG > 0
3092 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
3093 net_dbg_ratelimited("retrans_out leaked\n");
3094 }
3095#endif
3096 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
3097 tp->retrans_out += tcp_skb_pcount(skb);
3098 }
3099
 /* Save stamp of the first (attempted) retransmit. */
3101 if (!tp->retrans_stamp)
3102 tp->retrans_stamp = tcp_skb_timestamp(skb);
3103
3104 if (tp->undo_retrans < 0)
3105 tp->undo_retrans = 0;
3106 tp->undo_retrans += tcp_skb_pcount(skb);
3107 return err;
3108}
3109
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 */
3115void tcp_xmit_retransmit_queue(struct sock *sk)
3116{
3117 const struct inet_connection_sock *icsk = inet_csk(sk);
3118 struct sk_buff *skb, *rtx_head, *hole = NULL;
3119 struct tcp_sock *tp = tcp_sk(sk);
3120 bool rearm_timer = false;
3121 u32 max_segs;
3122 int mib_idx;
3123
3124 if (!tp->packets_out)
3125 return;
3126
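 /* Resume from the saved retransmit hint when we have one, otherwise
  * start from the head of the retransmit queue.
  */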
3127 rtx_head = tcp_rtx_queue_head(sk);
3128 skb = tp->retransmit_skb_hint ?: rtx_head;
3129 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
3130 skb_rbtree_walk_from(skb) {
3131 __u8 sacked;
3132 int segs;
3133
3134 if (tcp_pacing_check(sk))
3135 break;
3136
 /* we could do better than to assign each time */
3138 if (!hole)
3139 tp->retransmit_skb_hint = skb;
3140
3141 segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
3142 if (segs <= 0)
3143 break;
3144 sacked = TCP_SKB_CB(skb)->sacked;
 /* In case tcp_shift_skb_data() has aggregated large skbs,
  * we need to make sure we are not sending too big TSO packets
  */
3148 segs = min_t(int, segs, max_segs);
3149
3150 if (tp->retrans_out >= tp->lost_out) {
3151 break;
3152 } else if (!(sacked & TCPCB_LOST)) {
3153 if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
3154 hole = skb;
3155 continue;
3156
3157 } else {
3158 if (icsk->icsk_ca_state != TCP_CA_Loss)
3159 mib_idx = LINUX_MIB_TCPFASTRETRANS;
3160 else
3161 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
3162 }
3163
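 /* Segments already SACKed or already retransmitted need no further
  * work here.
  */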
3164 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
3165 continue;
3166
3167 if (tcp_small_queue_check(sk, skb, 1))
3168 break;
3169
3170 if (tcp_retransmit_skb(sk, skb, segs))
3171 break;
3172
3173 NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
3174
3175 if (tcp_in_cwnd_reduction(sk))
3176 tp->prr_out += tcp_skb_pcount(skb);
3177
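 /* Once the head of the rtx queue has been retransmitted, make sure
  * the retransmission timer gets (re)armed, unless the RACK reordering
  * timer is already pending.
  */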
3178 if (skb == rtx_head &&
3179 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3180 rearm_timer = true;
3181
3182 }
3183 if (rearm_timer)
3184 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3185 inet_csk(sk)->icsk_rto,
3186 TCP_RTO_MAX);
3187}
3188
/* We allow to exceed memory limits for FIN packets to expedite
 * connection tear down and (memory) recovery.
 * Otherwise tcp_send_fin() could be tempted to either delay FIN
 * or even be forced to close flow without any FIN.
 * In general, we want to allow one skb per socket to avoid hangs
 * with edge trigger epoll()
 */
3196void sk_forced_mem_schedule(struct sock *sk, int size)
3197{
3198 int amt;
3199
3200 if (size <= sk->sk_forward_alloc)
3201 return;
3202 amt = sk_mem_pages(size);
3203 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
3204 sk_memory_allocated_add(sk, amt);
3205
3206 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3207 mem_cgroup_charge_skmem(sk->sk_memcg, amt);
3208}
3209
/* Send a FIN. The caller locks the socket for us.
 * We should try to send a FIN packet really hard, but eventually give up.
 */
3213void tcp_send_fin(struct sock *sk)
3214{
3215 struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
3216 struct tcp_sock *tp = tcp_sk(sk);
3217
 /* Optimization, tack on the FIN if we have one skb in write queue and
  * this skb was not yet sent, or we are under memory pressure.
  * Note: in the latter case, FIN packet will be sent after a timeout,
  * as TCP stack thinks it has already been transmitted.
  */
3223 tskb = tail;
3224 if (!tskb && tcp_under_memory_pressure(sk))
3225 tskb = skb_rb_last(&sk->tcp_rtx_queue);
3226
3227 if (tskb) {
3228 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
3229 TCP_SKB_CB(tskb)->end_seq++;
3230 tp->write_seq++;
3231 if (!tail) {
 /* This means tskb was already sent.
  * Pretend we included the FIN on previous transmit.
  * We need to set tp->snd_nxt to the value it would have
  * if FIN had been sent. This is because retransmit path
  * does not change tp->snd_nxt.
  */
3238 WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
3239 return;
3240 }
3241 } else {
3242 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
3243 if (unlikely(!skb))
3244 return;
3245
3246 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
3247 skb_reserve(skb, MAX_TCP_HEADER);
3248 sk_forced_mem_schedule(sk, skb->truesize);
 /* FIN eats a sequence byte, write_seq is advanced by tcp_queue_skb(). */
3250 tcp_init_nondata_skb(skb, tp->write_seq,
3251 TCPHDR_ACK | TCPHDR_FIN);
3252 tcp_queue_skb(sk, skb);
3253 }
3254 __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
3255}
3256
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by RFC 2525, section 2.17.  -DaveM
 */
3262void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3263{
3264 struct sk_buff *skb;
3265
3266 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3267
 /* NOTE: No TCP options attached and we never retransmit this. */
3269 skb = alloc_skb(MAX_TCP_HEADER, priority);
3270 if (!skb) {
3271 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3272 return;
3273 }
3274
3275
3276 skb_reserve(skb, MAX_TCP_HEADER);
3277 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
3278 TCPHDR_ACK | TCPHDR_RST);
3279 tcp_mstamp_refresh(tcp_sk(sk));
3280
3281 if (tcp_transmit_skb(sk, skb, 0, priority))
3282 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3283
 /* trace_tcp_send_reset() normally keeps the skb that caused the RST;
  * the skb built here is not that skb, so pass NULL.
  */
3287 trace_tcp_send_reset(sk, NULL);
3288}
3289
/* Send a crossed SYN-ACK during socket establishment.
 * WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
3296int tcp_send_synack(struct sock *sk)
3297{
3298 struct sk_buff *skb;
3299
3300 skb = tcp_rtx_queue_head(sk);
3301 if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3302 pr_err("%s: wrong queue state\n", __func__);
3303 return -EFAULT;
3304 }
3305 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
3306 if (skb_cloned(skb)) {
3307 struct sk_buff *nskb;
3308
3309 tcp_skb_tsorted_save(skb) {
3310 nskb = skb_copy(skb, GFP_ATOMIC);
3311 } tcp_skb_tsorted_restore(skb);
3312 if (!nskb)
3313 return -ENOMEM;
3314 INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
3315 tcp_highest_sack_replace(sk, skb, nskb);
3316 tcp_rtx_queue_unlink_and_free(skb, sk);
3317 __skb_header_release(nskb);
3318 tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3319 sk_wmem_queued_add(sk, nskb->truesize);
3320 sk_mem_charge(sk, nskb->truesize);
3321 skb = nskb;
3322 }
3323
3324 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
3325 tcp_ecn_send_synack(sk, skb);
3326 }
3327 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3328}
3329
/**
 * tcp_make_synack - Allocate one skb and build a SYNACK packet.
 * @sk: listener socket
 * @dst: dst entry attached to the SYNACK
 * @req: request_sock pointer
 * @foc: cookie for tcp fast open
 * @synack_type: Type of synack to prepare
 *
 * @dst is consumed : Caller should not use it again.
 */
3339struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3340 struct request_sock *req,
3341 struct tcp_fastopen_cookie *foc,
3342 enum tcp_synack_type synack_type)
3343{
3344 struct inet_request_sock *ireq = inet_rsk(req);
3345 const struct tcp_sock *tp = tcp_sk(sk);
3346 struct tcp_md5sig_key *md5 = NULL;
3347 struct tcp_out_options opts;
3348 struct sk_buff *skb;
3349 int tcp_header_size;
3350 struct tcphdr *th;
3351 int mss;
3352 u64 now;
3353
3354 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3355 if (unlikely(!skb)) {
3356 dst_release(dst);
3357 return NULL;
3358 }
3359
3360 skb_reserve(skb, MAX_TCP_HEADER);
3361
3362 switch (synack_type) {
3363 case TCP_SYNACK_NORMAL:
3364 skb_set_owner_w(skb, req_to_sk(req));
3365 break;
3366 case TCP_SYNACK_COOKIE:
 /* under synflood the listener is stateless, ditto the request_sock;
  * memory accounting for this skb is handled later.
  */
3370 break;
3371 case TCP_SYNACK_FASTOPEN:
 /* sk is a const pointer, because we want to express multiple
  * cpu might call us concurrently.
  * sk->sk_wmem_alloc in an atomic, we can promote to rw.
  */
3376 skb_set_owner_w(skb, (struct sock *)sk);
3377 break;
3378 }
3379 skb_dst_set(skb, dst);
3380
3381 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3382
3383 memset(&opts, 0, sizeof(opts));
3384 now = tcp_clock_ns();
3385#ifdef CONFIG_SYN_COOKIES
3386 if (unlikely(req->cookie_ts))
3387 skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
3388 else
3389#endif
3390 {
3391 skb->skb_mstamp_ns = now;
3392 if (!tcp_rsk(req)->snt_synack)
3393 tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
3394 }
3395
3396#ifdef CONFIG_TCP_MD5SIG
3397 rcu_read_lock();
3398 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
3399#endif
3400 skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
3401 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
3402 foc, synack_type) + sizeof(*th);
3403
3404 skb_push(skb, tcp_header_size);
3405 skb_reset_transport_header(skb);
3406
3407 th = (struct tcphdr *)skb->data;
3408 memset(th, 0, sizeof(struct tcphdr));
3409 th->syn = 1;
3410 th->ack = 1;
3411 tcp_ecn_make_synack(req, th);
3412 th->source = htons(ireq->ir_num);
3413 th->dest = ireq->ir_rmt_port;
3414 skb->mark = ireq->ir_mark;
3415 skb->ip_summed = CHECKSUM_PARTIAL;
3416 th->seq = htonl(tcp_rsk(req)->snt_isn);
 /* XXX data is queued and acked as is. No buffer/window check */
3418 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
3419
 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
3421 th->window = htons(min(req->rsk_rcv_wnd, 65535U));
3422 tcp_options_write((__be32 *)(th + 1), NULL, &opts);
3423 th->doff = (tcp_header_size >> 2);
3424 __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
3425
3426#ifdef CONFIG_TCP_MD5SIG
3427
3428 if (md5)
3429 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
3430 md5, req_to_sk(req), skb);
3431 rcu_read_unlock();
3432#endif
3433
3434 skb->skb_mstamp_ns = now;
3435 tcp_add_tx_delay(skb, tp);
3436
3437 return skb;
3438}
3439EXPORT_SYMBOL(tcp_make_synack);
3440
3441static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
3442{
3443 struct inet_connection_sock *icsk = inet_csk(sk);
3444 const struct tcp_congestion_ops *ca;
3445 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
3446
3447 if (ca_key == TCP_CA_UNSPEC)
3448 return;
3449
3450 rcu_read_lock();
3451 ca = tcp_ca_find_key(ca_key);
3452 if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
3453 bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
3454 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
3455 icsk->icsk_ca_ops = ca;
3456 }
3457 rcu_read_unlock();
3458}
3459
/* Do all connect socket setups that can be done AF independent. */
3461static void tcp_connect_init(struct sock *sk)
3462{
3463 const struct dst_entry *dst = __sk_dst_get(sk);
3464 struct tcp_sock *tp = tcp_sk(sk);
3465 __u8 rcv_wscale;
3466 u32 rcv_wnd;
3467
 /* We'll fix this up when we get a response from the other end.
  * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
  */
3471 tp->tcp_header_len = sizeof(struct tcphdr);
3472 if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
3473 tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
3474
3475#ifdef CONFIG_TCP_MD5SIG
3476 if (tp->af_specific->md5_lookup(sk, sk))
3477 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
3478#endif
3479
 /* If user gave his TCP_MAXSEG, record it to clamp */
3481 if (tp->rx_opt.user_mss)
3482 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
3483 tp->max_window = 0;
3484 tcp_mtup_init(sk);
3485 tcp_sync_mss(sk, dst_mtu(dst));
3486
3487 tcp_ca_dst_init(sk, dst);
3488
3489 if (!tp->window_clamp)
3490 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
3491 tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3492
3493 tcp_initialize_rcv_mss(sk);
3494
 /* limit the window selection if the user enforces a smaller rx buffer */
3496 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
3497 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
3498 tp->window_clamp = tcp_full_space(sk);
3499
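 /* A BPF program may supply the initial receive window; fall back to
  * the route metric when it returns 0.
  */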
3500 rcv_wnd = tcp_rwnd_init_bpf(sk);
3501 if (rcv_wnd == 0)
3502 rcv_wnd = dst_metric(dst, RTAX_INITRWND);
3503
3504 tcp_select_initial_window(sk, tcp_full_space(sk),
3505 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
3506 &tp->rcv_wnd,
3507 &tp->window_clamp,
3508 sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
3509 &rcv_wscale,
3510 rcv_wnd);
3511
3512 tp->rx_opt.rcv_wscale = rcv_wscale;
3513 tp->rcv_ssthresh = tp->rcv_wnd;
3514
3515 sk->sk_err = 0;
3516 sock_reset_flag(sk, SOCK_DONE);
3517 tp->snd_wnd = 0;
3518 tcp_init_wl(tp, 0);
3519 tcp_write_queue_purge(sk);
3520 tp->snd_una = tp->write_seq;
3521 tp->snd_sml = tp->write_seq;
3522 tp->snd_up = tp->write_seq;
3523 WRITE_ONCE(tp->snd_nxt, tp->write_seq);
3524
3525 if (likely(!tp->repair))
3526 tp->rcv_nxt = 0;
3527 else
3528 tp->rcv_tstamp = tcp_jiffies32;
3529 tp->rcv_wup = tp->rcv_nxt;
3530 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
3531
3532 inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
3533 inet_csk(sk)->icsk_retransmits = 0;
3534 tcp_clear_retrans(tp);
3535}
3536
3537static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
3538{
3539 struct tcp_sock *tp = tcp_sk(sk);
3540 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
3541
3542 tcb->end_seq += skb->len;
3543 __skb_header_release(skb);
3544 sk_wmem_queued_add(sk, skb->truesize);
3545 sk_mem_charge(sk, skb->truesize);
3546 WRITE_ONCE(tp->write_seq, tcb->end_seq);
3547 tp->packets_out += tcp_skb_pcount(skb);
3548}
3549
/* Build and send a SYN with data and (cached) Fast Open cookie. However,
 * queue a data-only packet after the regular SYN, such that regular SYNs
 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
 * only the SYN sequence, the data are retransmitted in the first ACK.
 * If cookie is not cached or other error occurs, falls back to send a
 * regular SYN with Fast Open cookie request option.
 */
3557static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3558{
3559 struct tcp_sock *tp = tcp_sk(sk);
3560 struct tcp_fastopen_request *fo = tp->fastopen_req;
3561 int space, err = 0;
3562 struct sk_buff *syn_data;
3563
3564 tp->rx_opt.mss_clamp = tp->advmss;
3565 if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
3566 goto fallback;
3567
 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
  * user-MSS. Reserve maximum option space for middleboxes that add
  * private TCP options. The cost is reduced data space in SYN :(
  */
3572 tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3573
3574 space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3575 MAX_TCP_OPTION_SPACE;
3576
3577 space = min_t(size_t, space, fo->size);
3578
 /* limit to order-0 allocations */
3580 space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
3581
3582 syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
3583 if (!syn_data)
3584 goto fallback;
3585 syn_data->ip_summed = CHECKSUM_PARTIAL;
3586 memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
3587 if (space) {
3588 int copied = copy_from_iter(skb_put(syn_data, space), space,
3589 &fo->data->msg_iter);
3590 if (unlikely(!copied)) {
3591 tcp_skb_tsorted_anchor_cleanup(syn_data);
3592 kfree_skb(syn_data);
3593 goto fallback;
3594 }
3595 if (copied != space) {
3596 skb_trim(syn_data, copied);
3597 space = copied;
3598 }
3599 skb_zcopy_set(syn_data, fo->uarg, NULL);
3600 }
3601
3602 if (space == fo->size)
3603 fo->data = NULL;
3604 fo->copied = space;
3605
3606 tcp_connect_queue_skb(sk, syn_data);
3607 if (syn_data->len)
3608 tcp_chrono_start(sk, TCP_CHRONO_BUSY);
3609
3610 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
3611
3612 syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
3613
 /* Now full SYN+DATA was cloned and sent (or not),
  * remove the SYN from the original skb (syn_data)
  * we keep in write queue in case of a retransmit, as we
  * also have the SYN packet (with no data) in the same queue.
  */
3619 TCP_SKB_CB(syn_data)->seq++;
3620 TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
3621 if (!err) {
3622 tp->syn_data = (fo->copied > 0);
3623 tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
3624 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
3625 goto done;
3626 }
3627
 /* data was not sent, put it in write_queue */
3629 __skb_queue_tail(&sk->sk_write_queue, syn_data);
3630 tp->packets_out -= tcp_skb_pcount(syn_data);
3631
3632fallback:
 /* Send a regular SYN with Fast Open cookie request option */
3634 if (fo->cookie.len > 0)
3635 fo->cookie.len = 0;
3636 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
3637 if (err)
3638 tp->syn_fastopen = 0;
3639done:
3640 fo->cookie.len = -1;
3641 return err;
3642}
3643
/* Build a SYN and send it off. */
3645int tcp_connect(struct sock *sk)
3646{
3647 struct tcp_sock *tp = tcp_sk(sk);
3648 struct sk_buff *buff;
3649 int err;
3650
3651 tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
3652
3653 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3654 return -EHOSTUNREACH;
3655
3656 tcp_connect_init(sk);
3657
3658 if (unlikely(tp->repair)) {
3659 tcp_finish_connect(sk, NULL);
3660 return 0;
3661 }
3662
3663 buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
3664 if (unlikely(!buff))
3665 return -ENOBUFS;
3666
3667 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
3668 tcp_mstamp_refresh(tp);
3669 tp->retrans_stamp = tcp_time_stamp(tp);
3670 tcp_connect_queue_skb(sk, buff);
3671 tcp_ecn_send_syn(sk, buff);
3672 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
3673
 /* Send off SYN; include data in Fast Open. */
3675 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
3676 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
3677 if (err == -ECONNREFUSED)
3678 return err;
3679
 /* We change tp->snd_nxt after the tcp_transmit_skb() call
  * in order to make this packet get counted in tcpOutSegs.
  */
3683 WRITE_ONCE(tp->snd_nxt, tp->write_seq);
3684 tp->pushed_seq = tp->write_seq;
3685 buff = tcp_send_head(sk);
3686 if (unlikely(buff)) {
3687 WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
3688 tp->pushed_seq = TCP_SKB_CB(buff)->seq;
3689 }
3690 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
3691
 /* Timer for repeating the SYN until an answer. */
3693 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3694 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
3695 return 0;
3696}
3697EXPORT_SYMBOL(tcp_connect);
3698
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
3703void tcp_send_delayed_ack(struct sock *sk)
3704{
3705 struct inet_connection_sock *icsk = inet_csk(sk);
3706 int ato = icsk->icsk_ack.ato;
3707 unsigned long timeout;
3708
3709 if (ato > TCP_DELACK_MIN) {
3710 const struct tcp_sock *tp = tcp_sk(sk);
3711 int max_ato = HZ / 2;
3712
3713 if (inet_csk_in_pingpong_mode(sk) ||
3714 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
3715 max_ato = TCP_DELACK_MAX;
3716
 /* Slow path, intersegment interval is "high". */

 /* If some rtt estimate is known, use it to bound delayed ack.
  * Do not use inet_csk(sk)->icsk_rto here, use results of rtt
  * measurements directly.
  */
3723 if (tp->srtt_us) {
3724 int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
3725 TCP_DELACK_MIN);
3726
3727 if (rtt < max_ato)
3728 max_ato = rtt;
3729 }
3730
3731 ato = min(ato, max_ato);
3732 }
3733
3734
3735 timeout = jiffies + ato;
3736
 /* Use new timeout only if there wasn't an older one earlier. */
3738 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
 /* If delack timer was blocked or is about to expire,
  * send ACK now.
  */
3742 if (icsk->icsk_ack.blocked ||
3743 time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3744 tcp_send_ack(sk);
3745 return;
3746 }
3747
3748 if (!time_before(timeout, icsk->icsk_ack.timeout))
3749 timeout = icsk->icsk_ack.timeout;
3750 }
3751 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3752 icsk->icsk_ack.timeout = timeout;
3753 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
3754}
3755
/* This routine sends an ack and also updates the window. */
3757void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
3758{
3759 struct sk_buff *buff;
3760
 /* If we have been reset, we may not send again. */
3762 if (sk->sk_state == TCP_CLOSE)
3763 return;
3764
 /* We are not putting this on the write queue, so
  * tcp_transmit_skb() will set the ownership to this
  * sock.
  */
3769 buff = alloc_skb(MAX_TCP_HEADER,
3770 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
3771 if (unlikely(!buff)) {
3772 inet_csk_schedule_ack(sk);
3773 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3774 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3775 TCP_DELACK_MAX, TCP_RTO_MAX);
3776 return;
3777 }
3778
3779
3780 skb_reserve(buff, MAX_TCP_HEADER);
3781 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
3782
 /* We do not want pure acks influencing TCP Small Queues or fq/pacing
  * too much.
  * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
  */
3787 skb_set_tcp_pure_ack(buff);
3788
3789
3790 __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
3791}
3792EXPORT_SYMBOL_GPL(__tcp_send_ack);
3793
3794void tcp_send_ack(struct sock *sk)
3795{
3796 __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
3797}
3798
/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we make while urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */
3810static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
3811{
3812 struct tcp_sock *tp = tcp_sk(sk);
3813 struct sk_buff *skb;
3814
3815
3816 skb = alloc_skb(MAX_TCP_HEADER,
3817 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
3818 if (!skb)
3819 return -1;
3820
3821
3822 skb_reserve(skb, MAX_TCP_HEADER);
3823
 /* Use a previous sequence.  This should cause the other
  * end to send an ack.  Don't queue or clone SKB, just
  * send it.
  */
3827 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
3828 NET_INC_STATS(sock_net(sk), mib);
3829 return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
3830}
3831
/* Called from setsockopt( ... TCP_REPAIR ) */
3833void tcp_send_window_probe(struct sock *sk)
3834{
3835 if (sk->sk_state == TCP_ESTABLISHED) {
3836 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
3837 tcp_mstamp_refresh(tcp_sk(sk));
3838 tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
3839 }
3840}
3841
/* Initiate keepalive or window probe from timer. */
3843int tcp_write_wakeup(struct sock *sk, int mib)
3844{
3845 struct tcp_sock *tp = tcp_sk(sk);
3846 struct sk_buff *skb;
3847
3848 if (sk->sk_state == TCP_CLOSE)
3849 return -1;
3850
3851 skb = tcp_send_head(sk);
3852 if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
3853 int err;
3854 unsigned int mss = tcp_current_mss(sk);
3855 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
3856
3857 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
3858 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
3859
 /* We are probing the opening of a window
  * but the window size is != 0;
  * this must have been a result of sender side SWS avoidance.
  */
3864 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
3865 skb->len > mss) {
3866 seg_size = min(seg_size, mss);
3867 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3868 if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
3869 skb, seg_size, mss, GFP_ATOMIC))
3870 return -1;
3871 } else if (!tcp_skb_pcount(skb))
3872 tcp_set_skb_tso_segs(skb, mss);
3873
3874 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3875 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3876 if (!err)
3877 tcp_event_new_data_sent(sk, skb);
3878 return err;
3879 } else {
3880 if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
3881 tcp_xmit_probe_skb(sk, 1, mib);
3882 return tcp_xmit_probe_skb(sk, 0, mib);
3883 }
3884}
3885
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
3889void tcp_send_probe0(struct sock *sk)
3890{
3891 struct inet_connection_sock *icsk = inet_csk(sk);
3892 struct tcp_sock *tp = tcp_sk(sk);
3893 struct net *net = sock_net(sk);
3894 unsigned long timeout;
3895 int err;
3896
3897 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
3898
3899 if (tp->packets_out || tcp_write_queue_empty(sk)) {
 /* Cancel probe timer, if it is not required. */
3901 icsk->icsk_probes_out = 0;
3902 icsk->icsk_backoff = 0;
3903 return;
3904 }
3905
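 /* Count this probe attempt and choose when to probe again. */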
3906 icsk->icsk_probes_out++;
3907 if (err <= 0) {
3908 if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
3909 icsk->icsk_backoff++;
3910 timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
3911 } else {
 /* If the packet was not sent due to local congestion, do not
  * back off; retry after TCP_RESOURCE_PROBE_INTERVAL instead.
  */
3915 timeout = TCP_RESOURCE_PROBE_INTERVAL;
3916 }
3917 tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
3918}
3919
3920int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
3921{
3922 const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
3923 struct flowi fl;
3924 int res;
3925
3926 tcp_rsk(req)->txhash = net_tx_rndhash();
3927 res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
3928 if (!res) {
3929 __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
3930 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3931 if (unlikely(tcp_passive_fastopen(sk)))
3932 tcp_sk(sk)->total_retrans++;
3933 trace_tcp_retransmit_synack(sk, req);
3934 }
3935 return res;
3936}
3937EXPORT_SYMBOL(tcp_rtx_synack);
3938