#define pr_fmt(fmt) "TCP: " fmt

#include <net/tcp.h>
#include <net/mptcp.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>

#include <trace/events/tcp.h>

/* Refresh the socket's cached clock samples (tcp_clock_cache and
 * tcp_mstamp) from the current monotonic clock.
 */
53void tcp_mstamp_refresh(struct tcp_sock *tp)
54{
55 u64 val = tcp_clock_ns();
56
57 tp->tcp_clock_cache = val;
58 tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
59}
60
61static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
62 int push_one, gfp_t gfp);

/* Account for new data that has been sent to the network. */
65static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
66{
67 struct inet_connection_sock *icsk = inet_csk(sk);
68 struct tcp_sock *tp = tcp_sk(sk);
69 unsigned int prior_packets = tp->packets_out;
70
71 WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
72
73 __skb_unlink(skb, &sk->sk_write_queue);
74 tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
75
76 if (tp->highest_sack == NULL)
77 tp->highest_sack = skb;
78
79 tp->packets_out += tcp_skb_pcount(skb);
80 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
81 tcp_rearm_rto(sk);
82
83 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
84 tcp_skb_pcount(skb));
85}

/* Pick a send sequence number for a segment carrying no new data: use
 * SND.NXT if it is still within the advertised window (allowing for less
 * than one window-scale unit of overshoot), otherwise the right edge of
 * the window.
 */
94static inline __u32 tcp_acceptable_seq(const struct sock *sk)
95{
96 const struct tcp_sock *tp = tcp_sk(sk);
97
98 if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
99 (tp->rx_opt.wscale_ok &&
100 ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
101 return tp->snd_nxt;
102 else
103 return tcp_wnd_end(tp);
104}

/* Compute the MSS to advertise in a SYN: start from tp->advmss and clamp
 * it to the destination route's advertised-MSS metric, caching the result.
 */
120static __u16 tcp_advertise_mss(struct sock *sk)
121{
122 struct tcp_sock *tp = tcp_sk(sk);
123 const struct dst_entry *dst = __sk_dst_get(sk);
124 int mss = tp->advmss;
125
126 if (dst) {
127 unsigned int metric = dst_metric_advmss(dst);
128
129 if (metric < mss) {
130 mss = metric;
131 tp->advmss = mss;
132 }
133 }
134
135 return (__u16)mss;
136}

/* RFC2861. Reset the congestion window after an idle period longer than
 * the RTO: halve cwnd once per RTO elapsed, but never below the
 * (re)initial window.
 */
141void tcp_cwnd_restart(struct sock *sk, s32 delta)
142{
143 struct tcp_sock *tp = tcp_sk(sk);
144 u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
145 u32 cwnd = tp->snd_cwnd;
146
147 tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
148
149 tp->snd_ssthresh = tcp_current_ssthresh(sk);
150 restart_cwnd = min(restart_cwnd, cwnd);
151
152 while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
153 cwnd >>= 1;
154 tp->snd_cwnd = max(cwnd, restart_cwnd);
155 tp->snd_cwnd_stamp = tcp_jiffies32;
156 tp->snd_cwnd_used = 0;
157}

/* Congestion state accounting after a data packet has been sent. */
160static void tcp_event_data_sent(struct tcp_sock *tp,
161 struct sock *sk)
162{
163 struct inet_connection_sock *icsk = inet_csk(sk);
164 const u32 now = tcp_jiffies32;
165
166 if (tcp_packets_in_flight(tp) == 0)
167 tcp_ca_event(sk, CA_EVENT_TX_START);

 /* If this is the first data sent in reply to a previously received
  * segment and it goes out within the delayed-ACK timeout (ato) of that
  * segment, count it towards interactive (pingpong) mode.
  */
174 if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
175 (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
176 inet_csk_inc_pingpong_cnt(sk);
177
178 tp->lsndtime = now;
179}

/* Account for an ACK we sent. */
182static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
183 u32 rcv_nxt)
184{
185 struct tcp_sock *tp = tcp_sk(sk);
186
187 if (unlikely(tp->compressed_ack)) {
188 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
189 tp->compressed_ack);
190 tp->compressed_ack = 0;
191 if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
192 __sock_put(sk);
193 }
194
195 if (unlikely(rcv_nxt != tp->rcv_nxt))
196 return;
197 tcp_dec_quickack_mode(sk, pkts);
198 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
199}

/* Determine window scaling and the initial receive window to offer, based
 * on the assumption that the given amount of buffer space will be offered.
 * Results are returned through the rcv_wnd / window_clamp / rcv_wscale
 * pointers.  For smooth operation the initial offering should be a
 * multiple of mss if possible.
 */
208void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
209 __u32 *rcv_wnd, __u32 *window_clamp,
210 int wscale_ok, __u8 *rcv_wscale,
211 __u32 init_rcv_wnd)
212{
213 unsigned int space = (__space < 0 ? 0 : __space);
214
215
216 if (*window_clamp == 0)
217 (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
218 space = min(*window_clamp, space);
219
220
221 if (space > mss)
222 space = rounddown(space, mss);

 /* Some buggy TCP stacks interpret the unscaled window field as a signed
  * quantity; when the workaround sysctl is enabled, never offer more than
  * 32767 before scaling, otherwise allow up to 64K.
  */
232 if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
233 (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
234 else
235 (*rcv_wnd) = min_t(u32, space, U16_MAX);
236
237 if (init_rcv_wnd)
238 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
239
240 *rcv_wscale = 0;
241 if (wscale_ok) {
242
243 space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
244 space = max_t(u32, space, sysctl_rmem_max);
245 space = min_t(u32, space, *window_clamp);
246 *rcv_wscale = clamp_t(int, ilog2(space) - 15,
247 0, TCP_MAX_WSCALE);
248 }
249
250 (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
251}
252EXPORT_SYMBOL(tcp_select_initial_window);

/* Choose a new window to advertise, update state in tcp_sock for the
 * socket, and return the result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing frame.
 */
259static u16 tcp_select_window(struct sock *sk)
260{
261 struct tcp_sock *tp = tcp_sk(sk);
262 u32 old_win = tp->rcv_wnd;
263 u32 cur_win = tcp_receive_window(tp);
264 u32 new_win = __tcp_select_window(sk);
265
266
267 if (new_win < cur_win) {
268
269
270
271
272
273
274
275 if (new_win == 0)
276 NET_INC_STATS(sock_net(sk),
277 LINUX_MIB_TCPWANTZEROWINDOWADV);
278 new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
279 }
280 tp->rcv_wnd = new_win;
281 tp->rcv_wup = tp->rcv_nxt;
282
283
284
285
286 if (!tp->rx_opt.rcv_wscale &&
287 sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
288 new_win = min(new_win, MAX_TCP_WINDOW);
289 else
290 new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
291
292
293 new_win >>= tp->rx_opt.rcv_wscale;
294
295
296 if (new_win == 0) {
297 tp->pred_flags = 0;
298 if (old_win)
299 NET_INC_STATS(sock_net(sk),
300 LINUX_MIB_TCPTOZEROWINDOWADV);
301 } else if (old_win == 0) {
302 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
303 }
304
305 return new_win;
306}

/* Packet ECN state for a SYN-ACK */
309static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
310{
311 const struct tcp_sock *tp = tcp_sk(sk);
312
313 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
314 if (!(tp->ecn_flags & TCP_ECN_OK))
315 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
316 else if (tcp_ca_needs_ecn(sk) ||
317 tcp_bpf_ca_needs_ecn(sk))
318 INET_ECN_xmit(sk);
319}

/* Packet ECN state for a SYN. */
322static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
323{
324 struct tcp_sock *tp = tcp_sk(sk);
325 bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
326 bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
327 tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
328
329 if (!use_ecn) {
330 const struct dst_entry *dst = __sk_dst_get(sk);
331
332 if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
333 use_ecn = true;
334 }
335
336 tp->ecn_flags = 0;
337
338 if (use_ecn) {
339 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
340 tp->ecn_flags = TCP_ECN_OK;
341 if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
342 INET_ECN_xmit(sk);
343 }
344}
345
346static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
347{
348 if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
349
350
351
352 TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
353}
354
355static void
356tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
357{
358 if (inet_rsk(req)->ecn_ok)
359 th->ece = 1;
360}

/* Set up ECN state for a packet on an ESTABLISHED socket that is about to
 * be sent.
 */
365static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
366 struct tcphdr *th, int tcp_header_len)
367{
368 struct tcp_sock *tp = tcp_sk(sk);
369
370 if (tp->ecn_flags & TCP_ECN_OK) {
371
372 if (skb->len != tcp_header_len &&
373 !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
374 INET_ECN_xmit(sk);
375 if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
376 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
377 th->cwr = 1;
378 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
379 }
380 } else if (!tcp_ca_needs_ecn(sk)) {
381
382 INET_ECN_dontxmit(sk);
383 }
384 if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
385 th->ece = 1;
386 }
387}

/* Constructs common control bits of a non-data skb. If SYN/FIN is present,
 * auto increment end seqno.
 */
392static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
393{
394 skb->ip_summed = CHECKSUM_PARTIAL;
395
396 TCP_SKB_CB(skb)->tcp_flags = flags;
397 TCP_SKB_CB(skb)->sacked = 0;
398
399 tcp_skb_pcount_set(skb, 1);
400
401 TCP_SKB_CB(skb)->seq = seq;
402 if (flags & (TCPHDR_SYN | TCPHDR_FIN))
403 seq++;
404 TCP_SKB_CB(skb)->end_seq = seq;
405}
406
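/* Is the socket in urgent mode, i.e. is there an un-ACKed urgent pointer
 * outstanding (tp->snd_up != tp->snd_una)?
 */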
407static inline bool tcp_urg_mode(const struct tcp_sock *tp)
408{
409 return tp->snd_una != tp->snd_up;
410}
411
412#define OPTION_SACK_ADVERTISE (1 << 0)
413#define OPTION_TS (1 << 1)
414#define OPTION_MD5 (1 << 2)
415#define OPTION_WSCALE (1 << 3)
416#define OPTION_FAST_OPEN_COOKIE (1 << 8)
417#define OPTION_SMC (1 << 9)
418#define OPTION_MPTCP (1 << 10)
419
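/* Emit the experimental SMC option if it was negotiated for this packet. */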
420static void smc_options_write(__be32 *ptr, u16 *options)
421{
422#if IS_ENABLED(CONFIG_SMC)
423 if (static_branch_unlikely(&tcp_have_smc)) {
424 if (unlikely(OPTION_SMC & *options)) {
425 *ptr++ = htonl((TCPOPT_NOP << 24) |
426 (TCPOPT_NOP << 16) |
427 (TCPOPT_EXP << 8) |
428 (TCPOLEN_EXP_SMC_BASE));
429 *ptr++ = htonl(TCPOPT_SMC_MAGIC);
430 }
431 }
432#endif
433}
434
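/* The set of TCP options (and their values) computed for an outgoing
 * segment, later serialized by tcp_options_write().
 */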
435struct tcp_out_options {
436 u16 options;
437 u16 mss;
438 u8 ws;
439 u8 num_sack_blocks;
440 u8 hash_size;
441 __u8 *hash_location;
442 __u32 tsval, tsecr;
443 struct tcp_fastopen_cookie *fastopen_cookie;
444 struct mptcp_out_options mptcp;
445};
446
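/* Append any pending MPTCP sub-options computed for this segment. */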
447static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
448{
449#if IS_ENABLED(CONFIG_MPTCP)
450 if (unlikely(OPTION_MPTCP & opts->options))
451 mptcp_write_options(ptr, &opts->mptcp);
452#endif
453}

/* Write previously computed TCP options to the packet.
 *
 * Beware: something in the Internet is very sensitive to the ordering of
 * TCP options, so the layout below (MD5, MSS, timestamps/SACK-permitted,
 * window scale, SACK blocks, Fast Open, SMC, MPTCP) is deliberately kept.
 */
468static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
469 struct tcp_out_options *opts)
470{
471 u16 options = opts->options;
472
473 if (unlikely(OPTION_MD5 & options)) {
474 *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
475 (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
476
477 opts->hash_location = (__u8 *)ptr;
478 ptr += 4;
479 }
480
481 if (unlikely(opts->mss)) {
482 *ptr++ = htonl((TCPOPT_MSS << 24) |
483 (TCPOLEN_MSS << 16) |
484 opts->mss);
485 }
486
487 if (likely(OPTION_TS & options)) {
488 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
489 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
490 (TCPOLEN_SACK_PERM << 16) |
491 (TCPOPT_TIMESTAMP << 8) |
492 TCPOLEN_TIMESTAMP);
493 options &= ~OPTION_SACK_ADVERTISE;
494 } else {
495 *ptr++ = htonl((TCPOPT_NOP << 24) |
496 (TCPOPT_NOP << 16) |
497 (TCPOPT_TIMESTAMP << 8) |
498 TCPOLEN_TIMESTAMP);
499 }
500 *ptr++ = htonl(opts->tsval);
501 *ptr++ = htonl(opts->tsecr);
502 }
503
504 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
505 *ptr++ = htonl((TCPOPT_NOP << 24) |
506 (TCPOPT_NOP << 16) |
507 (TCPOPT_SACK_PERM << 8) |
508 TCPOLEN_SACK_PERM);
509 }
510
511 if (unlikely(OPTION_WSCALE & options)) {
512 *ptr++ = htonl((TCPOPT_NOP << 24) |
513 (TCPOPT_WINDOW << 16) |
514 (TCPOLEN_WINDOW << 8) |
515 opts->ws);
516 }
517
518 if (unlikely(opts->num_sack_blocks)) {
519 struct tcp_sack_block *sp = tp->rx_opt.dsack ?
520 tp->duplicate_sack : tp->selective_acks;
521 int this_sack;
522
523 *ptr++ = htonl((TCPOPT_NOP << 24) |
524 (TCPOPT_NOP << 16) |
525 (TCPOPT_SACK << 8) |
526 (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
527 TCPOLEN_SACK_PERBLOCK)));
528
529 for (this_sack = 0; this_sack < opts->num_sack_blocks;
530 ++this_sack) {
531 *ptr++ = htonl(sp[this_sack].start_seq);
532 *ptr++ = htonl(sp[this_sack].end_seq);
533 }
534
535 tp->rx_opt.dsack = 0;
536 }
537
538 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
539 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
540 u8 *p = (u8 *)ptr;
541 u32 len;
542
543 if (foc->exp) {
544 len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
545 *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
546 TCPOPT_FASTOPEN_MAGIC);
547 p += TCPOLEN_EXP_FASTOPEN_BASE;
548 } else {
549 len = TCPOLEN_FASTOPEN_BASE + foc->len;
550 *p++ = TCPOPT_FASTOPEN;
551 *p++ = len;
552 }
553
554 memcpy(p, foc->val, foc->len);
555 if ((len & 3) == 2) {
556 p[foc->len] = TCPOPT_NOP;
557 p[foc->len + 1] = TCPOPT_NOP;
558 }
559 ptr += (len + 3) >> 2;
560 }
561
562 smc_options_write(ptr, &options);
563
564 mptcp_options_write(ptr, opts);
565}
566
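/* Reserve option space for the experimental SMC option on an outgoing SYN,
 * if SMC is enabled for this socket and there is room left.
 */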
567static void smc_set_option(const struct tcp_sock *tp,
568 struct tcp_out_options *opts,
569 unsigned int *remaining)
570{
571#if IS_ENABLED(CONFIG_SMC)
572 if (static_branch_unlikely(&tcp_have_smc)) {
573 if (tp->syn_smc) {
574 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
575 opts->options |= OPTION_SMC;
576 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
577 }
578 }
579 }
580#endif
581}
582
583static void smc_set_option_cond(const struct tcp_sock *tp,
584 const struct inet_request_sock *ireq,
585 struct tcp_out_options *opts,
586 unsigned int *remaining)
587{
588#if IS_ENABLED(CONFIG_SMC)
589 if (static_branch_unlikely(&tcp_have_smc)) {
590 if (tp->syn_smc && ireq->smc_ok) {
591 if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
592 opts->options |= OPTION_SMC;
593 *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
594 }
595 }
596 }
597#endif
598}
599
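/* Reserve option space for MPTCP on a SYN-ACK, if the request is MPTCP
 * capable and the option fits in the remaining space.
 */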
600static void mptcp_set_option_cond(const struct request_sock *req,
601 struct tcp_out_options *opts,
602 unsigned int *remaining)
603{
604 if (rsk_is_mptcp(req)) {
605 unsigned int size;
606
607 if (mptcp_synack_options(req, &size, &opts->mptcp)) {
608 if (*remaining >= size) {
609 opts->options |= OPTION_MPTCP;
610 *remaining -= size;
611 }
612 }
613 }
614}

/* Compute TCP options for SYN packets. This is not the final
 * network wire format yet.
 */
619static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
620 struct tcp_out_options *opts,
621 struct tcp_md5sig_key **md5)
622{
623 struct tcp_sock *tp = tcp_sk(sk);
624 unsigned int remaining = MAX_TCP_OPTION_SPACE;
625 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
626
627 *md5 = NULL;
628#ifdef CONFIG_TCP_MD5SIG
629 if (static_branch_unlikely(&tcp_md5_needed) &&
630 rcu_access_pointer(tp->md5sig_info)) {
631 *md5 = tp->af_specific->md5_lookup(sk, sk);
632 if (*md5) {
633 opts->options |= OPTION_MD5;
634 remaining -= TCPOLEN_MD5SIG_ALIGNED;
635 }
636 }
637#endif

 /* An MSS option is always sent on a SYN; reserve its space first so the
  * optional extensions below share whatever option room remains.
  */
648 opts->mss = tcp_advertise_mss(sk);
649 remaining -= TCPOLEN_MSS_ALIGNED;
650
651 if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
652 opts->options |= OPTION_TS;
653 opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
654 opts->tsecr = tp->rx_opt.ts_recent;
655 remaining -= TCPOLEN_TSTAMP_ALIGNED;
656 }
657 if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
658 opts->ws = tp->rx_opt.rcv_wscale;
659 opts->options |= OPTION_WSCALE;
660 remaining -= TCPOLEN_WSCALE_ALIGNED;
661 }
662 if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
663 opts->options |= OPTION_SACK_ADVERTISE;
664 if (unlikely(!(OPTION_TS & opts->options)))
665 remaining -= TCPOLEN_SACKPERM_ALIGNED;
666 }
667
668 if (fastopen && fastopen->cookie.len >= 0) {
669 u32 need = fastopen->cookie.len;
670
671 need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
672 TCPOLEN_FASTOPEN_BASE;
673 need = (need + 3) & ~3U;
674 if (remaining >= need) {
675 opts->options |= OPTION_FAST_OPEN_COOKIE;
676 opts->fastopen_cookie = &fastopen->cookie;
677 remaining -= need;
678 tp->syn_fastopen = 1;
679 tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
680 }
681 }
682
683 smc_set_option(tp, opts, &remaining);
684
685 if (sk_is_mptcp(sk)) {
686 unsigned int size;
687
688 if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
689 opts->options |= OPTION_MPTCP;
690 remaining -= size;
691 }
692 }
693
694 return MAX_TCP_OPTION_SPACE - remaining;
695}

/* Set up TCP options for SYN-ACKs. */
698static unsigned int tcp_synack_options(const struct sock *sk,
699 struct request_sock *req,
700 unsigned int mss, struct sk_buff *skb,
701 struct tcp_out_options *opts,
702 const struct tcp_md5sig_key *md5,
703 struct tcp_fastopen_cookie *foc,
704 enum tcp_synack_type synack_type)
705{
706 struct inet_request_sock *ireq = inet_rsk(req);
707 unsigned int remaining = MAX_TCP_OPTION_SPACE;
708
709#ifdef CONFIG_TCP_MD5SIG
710 if (md5) {
711 opts->options |= OPTION_MD5;
712 remaining -= TCPOLEN_MD5SIG_ALIGNED;

  /* MD5, timestamps and SACK do not all fit in the option space, so when
   * MD5 is in use keep timestamps only if SACK was not requested
   * (syncookies encode their options differently and are exempt).
   */
719 if (synack_type != TCP_SYNACK_COOKIE)
720 ireq->tstamp_ok &= !ireq->sack_ok;
721 }
722#endif

 /* We always send an MSS option. */
725 opts->mss = mss;
726 remaining -= TCPOLEN_MSS_ALIGNED;
727
728 if (likely(ireq->wscale_ok)) {
729 opts->ws = ireq->rcv_wscale;
730 opts->options |= OPTION_WSCALE;
731 remaining -= TCPOLEN_WSCALE_ALIGNED;
732 }
733 if (likely(ireq->tstamp_ok)) {
734 opts->options |= OPTION_TS;
735 opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
736 opts->tsecr = req->ts_recent;
737 remaining -= TCPOLEN_TSTAMP_ALIGNED;
738 }
739 if (likely(ireq->sack_ok)) {
740 opts->options |= OPTION_SACK_ADVERTISE;
741 if (unlikely(!ireq->tstamp_ok))
742 remaining -= TCPOLEN_SACKPERM_ALIGNED;
743 }
744 if (foc != NULL && foc->len >= 0) {
745 u32 need = foc->len;
746
747 need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
748 TCPOLEN_FASTOPEN_BASE;
749 need = (need + 3) & ~3U;
750 if (remaining >= need) {
751 opts->options |= OPTION_FAST_OPEN_COOKIE;
752 opts->fastopen_cookie = foc;
753 remaining -= need;
754 }
755 }
756
757 mptcp_set_option_cond(req, opts, &remaining);
758
759 smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
760
761 return MAX_TCP_OPTION_SPACE - remaining;
762}

/* Compute TCP options for ESTABLISHED sockets. This is not the
 * final network wire format yet.
 */
767static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
768 struct tcp_out_options *opts,
769 struct tcp_md5sig_key **md5)
770{
771 struct tcp_sock *tp = tcp_sk(sk);
772 unsigned int size = 0;
773 unsigned int eff_sacks;
774
775 opts->options = 0;
776
777 *md5 = NULL;
778#ifdef CONFIG_TCP_MD5SIG
779 if (static_branch_unlikely(&tcp_md5_needed) &&
780 rcu_access_pointer(tp->md5sig_info)) {
781 *md5 = tp->af_specific->md5_lookup(sk, sk);
782 if (*md5) {
783 opts->options |= OPTION_MD5;
784 size += TCPOLEN_MD5SIG_ALIGNED;
785 }
786 }
787#endif
788
789 if (likely(tp->rx_opt.tstamp_ok)) {
790 opts->options |= OPTION_TS;
791 opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
792 opts->tsecr = tp->rx_opt.ts_recent;
793 size += TCPOLEN_TSTAMP_ALIGNED;
794 }

 /* MPTCP options have precedence over SACK: let MPTCP claim option space
  * first, then fit as many SACK blocks as possible into what is left.
  */
802 if (sk_is_mptcp(sk)) {
803 unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
804 unsigned int opt_size = 0;
805
806 if (mptcp_established_options(sk, skb, &opt_size, remaining,
807 &opts->mptcp)) {
808 opts->options |= OPTION_MPTCP;
809 size += opt_size;
810 }
811 }
812
813 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
814 if (unlikely(eff_sacks)) {
815 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
816 if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
817 TCPOLEN_SACK_PERBLOCK))
818 return size;
819
820 opts->num_sack_blocks =
821 min_t(unsigned int, eff_sacks,
822 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
823 TCPOLEN_SACK_PERBLOCK);
824
825 size += TCPOLEN_SACK_BASE_ALIGNED +
826 opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
827 }
828
829 return size;
830}

/* TCP SMALL QUEUES (TSQ)
 *
 * Per-cpu list of sockets whose transmission was throttled because too
 * much data was already queued in qdisc/device layers.  The tasklet
 * drains the list and resumes transmission once buffers are freed
 * (see tcp_wfree()).
 */
847struct tsq_tasklet {
848 struct tasklet_struct tasklet;
849 struct list_head head;
850};
851static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
852
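/* (Re)start transmission for a socket that was throttled by TCP Small
 * Queues: push due retransmits first, then any new data.
 */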
853static void tcp_tsq_write(struct sock *sk)
854{
855 if ((1 << sk->sk_state) &
856 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
857 TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) {
858 struct tcp_sock *tp = tcp_sk(sk);
859
860 if (tp->lost_out > tp->retrans_out &&
861 tp->snd_cwnd > tcp_packets_in_flight(tp)) {
862 tcp_mstamp_refresh(tp);
863 tcp_xmit_retransmit_queue(sk);
864 }
865
866 tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
867 0, GFP_ATOMIC);
868 }
869}
870
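/* Run the TSQ work for one socket: transmit directly if we can grab the
 * socket, otherwise defer to tcp_release_cb() via TCP_TSQ_DEFERRED.
 */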
871static void tcp_tsq_handler(struct sock *sk)
872{
873 bh_lock_sock(sk);
874 if (!sock_owned_by_user(sk))
875 tcp_tsq_write(sk);
876 else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
877 sock_hold(sk);
878 bh_unlock_sock(sk);
879}

/*
 * One tasklet per cpu tries to send more skbs.
 * We run in tasklet (softirq) context but need to disable irqs when
 * transferring tsq->head, because tcp_wfree() might interrupt us
 * (non NAPI drivers).
 */
886static void tcp_tasklet_func(unsigned long data)
887{
888 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
889 LIST_HEAD(list);
890 unsigned long flags;
891 struct list_head *q, *n;
892 struct tcp_sock *tp;
893 struct sock *sk;
894
895 local_irq_save(flags);
896 list_splice_init(&tsq->head, &list);
897 local_irq_restore(flags);
898
899 list_for_each_safe(q, n, &list) {
900 tp = list_entry(q, struct tcp_sock, tsq_node);
901 list_del(&tp->tsq_node);
902
903 sk = (struct sock *)tp;
904 smp_mb__before_atomic();
905 clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
906
907 tcp_tsq_handler(sk);
908 sk_free(sk);
909 }
910}
911
912#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
913 TCPF_WRITE_TIMER_DEFERRED | \
914 TCPF_DELACK_TIMER_DEFERRED | \
915 TCPF_MTU_REDUCED_DEFERRED)

/**
 * tcp_release_cb - tcp release_sock() callback
 * @sk: socket
 *
 * Called from release_sock() to perform protocol-dependent actions that
 * were deferred (flagged in sk_tsq_flags) while the socket was owned.
 */
923void tcp_release_cb(struct sock *sk)
924{
925 unsigned long flags, nflags;
926
927
928 do {
929 flags = sk->sk_tsq_flags;
930 if (!(flags & TCP_DEFERRED_ALL))
931 return;
932 nflags = flags & ~TCP_DEFERRED_ALL;
933 } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
934
935 if (flags & TCPF_TSQ_DEFERRED) {
936 tcp_tsq_write(sk);
937 __sock_put(sk);
938 }

 /* Here begins the tricky part: we are called from release_sock() with
  * BH disabled, the socket spinlock held and the socket owned by us.
  * The handlers below are meant to run from BH context, so keep BH
  * disabled but release socket ownership early.
  */
948 sock_release_ownership(sk);
949
950 if (flags & TCPF_WRITE_TIMER_DEFERRED) {
951 tcp_write_timer_handler(sk);
952 __sock_put(sk);
953 }
954 if (flags & TCPF_DELACK_TIMER_DEFERRED) {
955 tcp_delack_timer_handler(sk);
956 __sock_put(sk);
957 }
958 if (flags & TCPF_MTU_REDUCED_DEFERRED) {
959 inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
960 __sock_put(sk);
961 }
962}
963EXPORT_SYMBOL(tcp_release_cb);
964
965void __init tcp_tasklet_init(void)
966{
967 int i;
968
969 for_each_possible_cpu(i) {
970 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
971
972 INIT_LIST_HEAD(&tsq->head);
973 tasklet_init(&tsq->tasklet,
974 tcp_tasklet_func,
975 (unsigned long)tsq);
976 }
977}

/*
 * Write buffer destructor automatically called from kfree_skb.
 * We can't xmit new skbs from this context, as we might already
 * hold qdisc lock.
 */
984void tcp_wfree(struct sk_buff *skb)
985{
986 struct sock *sk = skb->sk;
987 struct tcp_sock *tp = tcp_sk(sk);
988 unsigned long flags, nval, oval;
989
990
991
992
993 WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));

 /* If this softirq is serviced by ksoftirqd, we are likely under stress.
  * Wait until our queues (qdisc + devices) are drained: this gives
  * fewer, larger batches to tcp_write_xmit() and a chance for an incoming
  * ACK to migrate this flow to another queue.
  */
1002 if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
1003 goto out;
1004
1005 for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
1006 struct tsq_tasklet *tsq;
1007 bool empty;
1008
1009 if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
1010 goto out;
1011
1012 nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
1013 nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
1014 if (nval != oval)
1015 continue;
1016
1017
1018 local_irq_save(flags);
1019 tsq = this_cpu_ptr(&tsq_tasklet);
1020 empty = list_empty(&tsq->head);
1021 list_add(&tp->tsq_node, &tsq->head);
1022 if (empty)
1023 tasklet_schedule(&tsq->tasklet);
1024 local_irq_restore(flags);
1025 return;
1026 }
1027out:
1028 sk_free(sk);
1029}

/* Note: Called under softirq.
 * We can call TCP stack right away, unless socket is owned by user.
 */
1034enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
1035{
1036 struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
1037 struct sock *sk = (struct sock *)tp;
1038
1039 tcp_tsq_handler(sk);
1040 sock_put(sk);
1041
1042 return HRTIMER_NORESTART;
1043}
1044
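/* Pacing bookkeeping after a (re)transmit: advance tcp_wstamp_ns by the
 * time this skb needs on the wire at the current pacing rate (minus any
 * credit accumulated since the previous send), and move the skb to the
 * tail of the tsorted list.
 */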
1045static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
1046 u64 prior_wstamp)
1047{
1048 struct tcp_sock *tp = tcp_sk(sk);
1049
1050 if (sk->sk_pacing_status != SK_PACING_NONE) {
1051 unsigned long rate = sk->sk_pacing_rate;
1052
1053
1054
1055
1056
1057 if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
1058 u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
1059 u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
1060
1061
1062 len_ns -= min_t(u64, len_ns / 2, credit);
1063 tp->tcp_wstamp_ns += len_ns;
1064 }
1065 }
1066 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
1067}
1068
1069INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1070INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
1071INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));

/* This routine actually transmits TCP packets queued in by
 * tcp_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
1084static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
1085 int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
1086{
1087 const struct inet_connection_sock *icsk = inet_csk(sk);
1088 struct inet_sock *inet;
1089 struct tcp_sock *tp;
1090 struct tcp_skb_cb *tcb;
1091 struct tcp_out_options opts;
1092 unsigned int tcp_options_size, tcp_header_size;
1093 struct sk_buff *oskb = NULL;
1094 struct tcp_md5sig_key *md5;
1095 struct tcphdr *th;
1096 u64 prior_wstamp;
1097 int err;
1098
1099 BUG_ON(!skb || !tcp_skb_pcount(skb));
1100 tp = tcp_sk(sk);
1101 prior_wstamp = tp->tcp_wstamp_ns;
1102 tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1103 skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
1104 if (clone_it) {
1105 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
1106 - tp->snd_una;
1107 oskb = skb;
1108
1109 tcp_skb_tsorted_save(oskb) {
1110 if (unlikely(skb_cloned(oskb)))
1111 skb = pskb_copy(oskb, gfp_mask);
1112 else
1113 skb = skb_clone(oskb, gfp_mask);
1114 } tcp_skb_tsorted_restore(oskb);
1115
1116 if (unlikely(!skb))
1117 return -ENOBUFS;
1118
1119
1120
1121 skb->dev = NULL;
1122 }
1123
1124 inet = inet_sk(sk);
1125 tcb = TCP_SKB_CB(skb);
1126 memset(&opts, 0, sizeof(opts));
1127
1128 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
1129 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1130 } else {
1131 tcp_options_size = tcp_established_options(sk, skb, &opts,
1132 &md5);
1133
1134
1135
1136
1137
1138
1139
1140
1141 if (tcp_skb_pcount(skb) > 1)
1142 tcb->tcp_flags |= TCPHDR_PSH;
1143 }
1144 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
1145
1146
1147
1148
1149
1150
1151
1152
1153 skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
1154
1155
1156
1157
1158
1159
1160 skb->pfmemalloc = 0;
1161
1162 skb_push(skb, tcp_header_size);
1163 skb_reset_transport_header(skb);
1164
1165 skb_orphan(skb);
1166 skb->sk = sk;
1167 skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
1168 skb_set_hash_from_sk(skb, sk);
1169 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1170
1171 skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);

 /* Build TCP header and checksum it. */
1174 th = (struct tcphdr *)skb->data;
1175 th->source = inet->inet_sport;
1176 th->dest = inet->inet_dport;
1177 th->seq = htonl(tcb->seq);
1178 th->ack_seq = htonl(rcv_nxt);
1179 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
1180 tcb->tcp_flags);
1181
1182 th->check = 0;
1183 th->urg_ptr = 0;
1184
1185
1186 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
1187 if (before(tp->snd_up, tcb->seq + 0x10000)) {
1188 th->urg_ptr = htons(tp->snd_up - tcb->seq);
1189 th->urg = 1;
1190 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
1191 th->urg_ptr = htons(0xFFFF);
1192 th->urg = 1;
1193 }
1194 }
1195
1196 tcp_options_write((__be32 *)(th + 1), tp, &opts);
1197 skb_shinfo(skb)->gso_type = sk->sk_gso_type;
1198 if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
1199 th->window = htons(tcp_select_window(sk));
1200 tcp_ecn_send(sk, skb, th, tcp_header_size);
1201 } else {
  /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
1205 th->window = htons(min(tp->rcv_wnd, 65535U));
1206 }
1207#ifdef CONFIG_TCP_MD5SIG
 /* Calculate the MD5 hash, as we have all we need now */
1209 if (md5) {
1210 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1211 tp->af_specific->calc_md5_hash(opts.hash_location,
1212 md5, sk, skb);
1213 }
1214#endif
1215
1216 INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
1217 tcp_v6_send_check, tcp_v4_send_check,
1218 sk, skb);
1219
1220 if (likely(tcb->tcp_flags & TCPHDR_ACK))
1221 tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
1222
1223 if (skb->len != tcp_header_size) {
1224 tcp_event_data_sent(tp, sk);
1225 tp->data_segs_out += tcp_skb_pcount(skb);
1226 tp->bytes_sent += skb->len - tcp_header_size;
1227 }
1228
1229 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
1230 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
1231 tcp_skb_pcount(skb));
1232
1233 tp->segs_out += tcp_skb_pcount(skb);
1234
1235 skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
1236 skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
1237
1238
1239
1240
1241 memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
1242 sizeof(struct inet6_skb_parm)));
1243
1244 tcp_add_tx_delay(skb, tp);
1245
1246 err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
1247 inet6_csk_xmit, ip_queue_xmit,
1248 sk, skb, &inet->cork.fl);
1249
1250 if (unlikely(err > 0)) {
1251 tcp_enter_cwr(sk);
1252 err = net_xmit_eval(err);
1253 }
1254 if (!err && oskb) {
1255 tcp_update_skb_after_send(sk, oskb, prior_wstamp);
1256 tcp_rate_skb_sent(sk, oskb);
1257 }
1258 return err;
1259}
1260
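/* Transmit @skb, acknowledging up to the connection's current rcv_nxt. */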
1261static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1262 gfp_t gfp_mask)
1263{
1264 return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
1265 tcp_sk(sk)->rcv_nxt);
1266}

/* This routine just queues the buffer for sending.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
1273static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
1274{
1275 struct tcp_sock *tp = tcp_sk(sk);

 /* Advance write_seq and place onto the write_queue. */
1278 WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
1279 __skb_header_release(skb);
1280 tcp_add_write_queue_tail(sk, skb);
1281 sk_wmem_queued_add(sk, skb->truesize);
1282 sk_mem_charge(sk, skb->truesize);
1283}

/* Initialize TSO segments for a packet. */
1286static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1287{
1288 if (skb->len <= mss_now) {
  /* Avoid the costly divide in the normal non-TSO case. */
1292 tcp_skb_pcount_set(skb, 1);
1293 TCP_SKB_CB(skb)->tcp_gso_size = 0;
1294 } else {
1295 tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1296 TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
1297 }
1298}

/* Pcount in the middle of the write queue got changed, we need to do various
 * tweaks to fix counters.
 */
1303static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
1304{
1305 struct tcp_sock *tp = tcp_sk(sk);
1306
1307 tp->packets_out -= decr;
1308
1309 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1310 tp->sacked_out -= decr;
1311 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1312 tp->retrans_out -= decr;
1313 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1314 tp->lost_out -= decr;
1315
1316
1317 if (tcp_is_reno(tp) && decr > 0)
1318 tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1319
1320 if (tp->lost_skb_hint &&
1321 before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1322 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1323 tp->lost_cnt_hint -= decr;
1324
1325 tcp_verify_left_out(tp);
1326}
1327
1328static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
1329{
1330 return TCP_SKB_CB(skb)->txstamp_ack ||
1331 (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
1332}
1333
1334static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
1335{
1336 struct skb_shared_info *shinfo = skb_shinfo(skb);
1337
1338 if (unlikely(tcp_has_tx_tstamp(skb)) &&
1339 !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
1340 struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
1341 u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
1342
1343 shinfo->tx_flags &= ~tsflags;
1344 shinfo2->tx_flags |= tsflags;
1345 swap(shinfo->tskey, shinfo2->tskey);
1346 TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
1347 TCP_SKB_CB(skb)->txstamp_ack = 0;
1348 }
1349}
1350
1351static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
1352{
1353 TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
1354 TCP_SKB_CB(skb)->eor = 0;
1355}

/* Insert buff after skb on the write or rtx queue of sk. */
1358static void tcp_insert_write_queue_after(struct sk_buff *skb,
1359 struct sk_buff *buff,
1360 struct sock *sk,
1361 enum tcp_queue tcp_queue)
1362{
1363 if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
1364 __skb_queue_after(&sk->sk_write_queue, skb, buff);
1365 else
1366 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
1367}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
1374int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
1375 struct sk_buff *skb, u32 len,
1376 unsigned int mss_now, gfp_t gfp)
1377{
1378 struct tcp_sock *tp = tcp_sk(sk);
1379 struct sk_buff *buff;
1380 int nsize, old_factor;
1381 long limit;
1382 int nlen;
1383 u8 flags;
1384
1385 if (WARN_ON(len > skb->len))
1386 return -EINVAL;
1387
1388 nsize = skb_headlen(skb) - len;
1389 if (nsize < 0)
1390 nsize = 0;

 /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
  * We need some allowance to not penalize applications setting small
  * SO_SNDBUF values.
  * Also allow first and last skb in retransmit queue to be split.
  */
1397 limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
1398 if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
1399 tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
1400 skb != tcp_rtx_queue_head(sk) &&
1401 skb != tcp_rtx_queue_tail(sk))) {
1402 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
1403 return -ENOMEM;
1404 }
1405
1406 if (skb_unclone(skb, gfp))
1407 return -ENOMEM;
1408
1409
1410 buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
1411 if (!buff)
1412 return -ENOMEM;
1413 skb_copy_decrypted(buff, skb);
1414
1415 sk_wmem_queued_add(sk, buff->truesize);
1416 sk_mem_charge(sk, buff->truesize);
1417 nlen = skb->len - len - nsize;
1418 buff->truesize += nlen;
1419 skb->truesize -= nlen;
1420
1421
1422 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1423 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1424 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1425
1426
1427 flags = TCP_SKB_CB(skb)->tcp_flags;
1428 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1429 TCP_SKB_CB(buff)->tcp_flags = flags;
1430 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1431 tcp_skb_fragment_eor(skb, buff);
1432
1433 skb_split(skb, buff, len);
1434
1435 buff->ip_summed = CHECKSUM_PARTIAL;
1436
1437 buff->tstamp = skb->tstamp;
1438 tcp_fragment_tstamp(skb, buff);
1439
1440 old_factor = tcp_skb_pcount(skb);
1441
1442
1443 tcp_set_skb_tso_segs(skb, mss_now);
1444 tcp_set_skb_tso_segs(buff, mss_now);
1445
1446
1447 TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
1448
1449
1450
1451
1452 if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1453 int diff = old_factor - tcp_skb_pcount(skb) -
1454 tcp_skb_pcount(buff);
1455
1456 if (diff)
1457 tcp_adjust_pcount(sk, skb, diff);
1458 }
1459
1460
1461 __skb_header_release(buff);
1462 tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1463 if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
1464 list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
1465
1466 return 0;
1467}

/* This is similar to __pskb_pull_tail(). The difference is that pulled
 * data is not copied, but immediately discarded.
 */
1472static int __pskb_trim_head(struct sk_buff *skb, int len)
1473{
1474 struct skb_shared_info *shinfo;
1475 int i, k, eat;
1476
1477 eat = min_t(int, len, skb_headlen(skb));
1478 if (eat) {
1479 __skb_pull(skb, eat);
1480 len -= eat;
1481 if (!len)
1482 return 0;
1483 }
1484 eat = len;
1485 k = 0;
1486 shinfo = skb_shinfo(skb);
1487 for (i = 0; i < shinfo->nr_frags; i++) {
1488 int size = skb_frag_size(&shinfo->frags[i]);
1489
1490 if (size <= eat) {
1491 skb_frag_unref(skb, i);
1492 eat -= size;
1493 } else {
1494 shinfo->frags[k] = shinfo->frags[i];
1495 if (eat) {
1496 skb_frag_off_add(&shinfo->frags[k], eat);
1497 skb_frag_size_sub(&shinfo->frags[k], eat);
1498 eat = 0;
1499 }
1500 k++;
1501 }
1502 }
1503 shinfo->nr_frags = k;
1504
1505 skb->data_len -= len;
1506 skb->len = skb->data_len;
1507 return len;
1508}

/* Remove acked data from a packet in the transmit queue. */
1511int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1512{
1513 u32 delta_truesize;
1514
1515 if (skb_unclone(skb, GFP_ATOMIC))
1516 return -ENOMEM;
1517
1518 delta_truesize = __pskb_trim_head(skb, len);
1519
1520 TCP_SKB_CB(skb)->seq += len;
1521 skb->ip_summed = CHECKSUM_PARTIAL;
1522
1523 if (delta_truesize) {
1524 skb->truesize -= delta_truesize;
1525 sk_wmem_queued_add(sk, -delta_truesize);
1526 sk_mem_uncharge(sk, delta_truesize);
1527 sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
1528 }
1529
1530
1531 if (tcp_skb_pcount(skb) > 1)
1532 tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
1533
1534 return 0;
1535}

/* Calculate MSS not accounting any TCP options. */
1538static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
1539{
1540 const struct tcp_sock *tp = tcp_sk(sk);
1541 const struct inet_connection_sock *icsk = inet_csk(sk);
1542 int mss_now;
1543
1544
1545
1546
1547 mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1548
1549
1550 if (icsk->icsk_af_ops->net_frag_header_len) {
1551 const struct dst_entry *dst = __sk_dst_get(sk);
1552
1553 if (dst && dst_allfrag(dst))
1554 mss_now -= icsk->icsk_af_ops->net_frag_header_len;
1555 }
1556
1557
1558 if (mss_now > tp->rx_opt.mss_clamp)
1559 mss_now = tp->rx_opt.mss_clamp;
1560
1561
1562 mss_now -= icsk->icsk_ext_hdr_len;
1563
1564
1565 mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
1566 return mss_now;
1567}

/* Calculate MSS. Not accounting for SACKs here. */
1570int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
 /* Subtract TCP options size, not including SACKs */
1573 return __tcp_mtu_to_mss(sk, pmtu) -
1574 (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
1575}

/* Inverse of the above: compute the MTU implied by a given MSS. */
1578int tcp_mss_to_mtu(struct sock *sk, int mss)
1579{
1580 const struct tcp_sock *tp = tcp_sk(sk);
1581 const struct inet_connection_sock *icsk = inet_csk(sk);
1582 int mtu;
1583
1584 mtu = mss +
1585 tp->tcp_header_len +
1586 icsk->icsk_ext_hdr_len +
1587 icsk->icsk_af_ops->net_header_len;
1588
1589
1590 if (icsk->icsk_af_ops->net_frag_header_len) {
1591 const struct dst_entry *dst = __sk_dst_get(sk);
1592
1593 if (dst && dst_allfrag(dst))
1594 mtu += icsk->icsk_af_ops->net_frag_header_len;
1595 }
1596 return mtu;
1597}
1598EXPORT_SYMBOL(tcp_mss_to_mtu);

/* MTU probing init per socket */
1601void tcp_mtup_init(struct sock *sk)
1602{
1603 struct tcp_sock *tp = tcp_sk(sk);
1604 struct inet_connection_sock *icsk = inet_csk(sk);
1605 struct net *net = sock_net(sk);
1606
1607 icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
1608 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1609 icsk->icsk_af_ops->net_header_len;
1610 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
1611 icsk->icsk_mtup.probe_size = 0;
1612 if (icsk->icsk_mtup.enabled)
1613 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
1614}
1615EXPORT_SYMBOL(tcp_mtup_init);

/* Synchronize the sending MSS with the current pmtu / extension headers.
 *
 * tp->rx_opt.mss_clamp is the mss negotiated at connection setup (no TCP
 * options included).  icsk_pmtu_cookie is the last pmtu seen by this
 * function.  tp->mss_cache is the current effective sending mss, including
 * all TCP options except SACK; it never exceeds tp->rx_opt.mss_clamp.
 */
1639unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1640{
1641 struct tcp_sock *tp = tcp_sk(sk);
1642 struct inet_connection_sock *icsk = inet_csk(sk);
1643 int mss_now;
1644
1645 if (icsk->icsk_mtup.search_high > pmtu)
1646 icsk->icsk_mtup.search_high = pmtu;
1647
1648 mss_now = tcp_mtu_to_mss(sk, pmtu);
1649 mss_now = tcp_bound_to_half_wnd(tp, mss_now);
1650
1651
1652 icsk->icsk_pmtu_cookie = pmtu;
1653 if (icsk->icsk_mtup.enabled)
1654 mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1655 tp->mss_cache = mss_now;
1656
1657 return mss_now;
1658}
1659EXPORT_SYMBOL(tcp_sync_mss);

/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 */
1664unsigned int tcp_current_mss(struct sock *sk)
1665{
1666 const struct tcp_sock *tp = tcp_sk(sk);
1667 const struct dst_entry *dst = __sk_dst_get(sk);
1668 u32 mss_now;
1669 unsigned int header_len;
1670 struct tcp_out_options opts;
1671 struct tcp_md5sig_key *md5;
1672
1673 mss_now = tp->mss_cache;
1674
1675 if (dst) {
1676 u32 mtu = dst_mtu(dst);
1677 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1678 mss_now = tcp_sync_mss(sk, mtu);
1679 }
1680
1681 header_len = tcp_established_options(sk, NULL, &opts, &md5) +
1682 sizeof(struct tcphdr);
1683
1684
1685
1686
1687 if (header_len != tp->tcp_header_len) {
1688 int delta = (int) header_len - tp->tcp_header_len;
1689 mss_now -= delta;
1690 }
1691
1692 return mss_now;
1693}
1694

/* RFC2861, slow part. Adjust cwnd after it was not fully used for one RTO,
 * shrinking it towards what was actually used while application limited.
 */
1699static void tcp_cwnd_application_limited(struct sock *sk)
1700{
1701 struct tcp_sock *tp = tcp_sk(sk);
1702
1703 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1704 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1705
1706 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
1707 u32 win_used = max(tp->snd_cwnd_used, init_win);
1708 if (win_used < tp->snd_cwnd) {
1709 tp->snd_ssthresh = tcp_current_ssthresh(sk);
1710 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
1711 }
1712 tp->snd_cwnd_used = 0;
1713 }
1714 tp->snd_cwnd_stamp = tcp_jiffies32;
1715}
1716
1717static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1718{
1719 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1720 struct tcp_sock *tp = tcp_sk(sk);
1721
1722
1723
1724
1725 if (!before(tp->snd_una, tp->max_packets_seq) ||
1726 tp->packets_out > tp->max_packets_out) {
1727 tp->max_packets_out = tp->packets_out;
1728 tp->max_packets_seq = tp->snd_nxt;
1729 tp->is_cwnd_limited = is_cwnd_limited;
1730 }
1731
1732 if (tcp_is_cwnd_limited(sk)) {
1733
1734 tp->snd_cwnd_used = 0;
1735 tp->snd_cwnd_stamp = tcp_jiffies32;
1736 } else {
1737
1738 if (tp->packets_out > tp->snd_cwnd_used)
1739 tp->snd_cwnd_used = tp->packets_out;
1740
1741 if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
1742 (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
1743 !ca_ops->cong_control)
1744 tcp_cwnd_application_limited(sk);

  /* The following conditions together indicate the starvation
   * is caused by insufficient sender buffer:
   * 1) just sent some data (see tcp_write_xmit)
   * 2) not cwnd limited (this else condition)
   * 3) no more data to send (tcp_write_queue_empty())
   * 4) application is hitting buffer limit (SOCK_NOSPACE)
   */
1753 if (tcp_write_queue_empty(sk) && sk->sk_socket &&
1754 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1755 (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1756 tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
1757 }
1758}

/* Minshall's variant of the Nagle send check. */
1761static bool tcp_minshall_check(const struct tcp_sock *tp)
1762{
1763 return after(tp->snd_sml, tp->snd_una) &&
1764 !after(tp->snd_sml, tp->snd_nxt);
1765}

/* Update snd_sml if this skb ends with a sub-mss segment.
 * The test is really "if ((skb->len % mss_now) != 0)", but we avoid the
 * divide by reusing the segment count cached in tcp_skb_pcount(skb).
 */
1775static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
1776 const struct sk_buff *skb)
1777{
1778 if (skb->len < tcp_skb_pcount(skb) * mss_now)
1779 tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1780}

/* Return true if we should delay this partial segment under Nagle's rules,
 * i.e. the segment is not full sized and either TCP_CORK is set, or
 * Nagle is enabled (no TCP_NODELAY) and previously sent small packets
 * have not yet been ACKed (Minshall's modification).
 */
1789static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1790 int nonagle)
1791{
1792 return partial &&
1793 ((nonagle & TCP_NAGLE_CORK) ||
1794 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1795}

/* Return how many segments we want in a TSO skb, based on the current
 * pacing rate (roughly one burst per ~1 ms at the default pacing shift),
 * but never fewer than @min_tso_segs.
 */
1800static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
1801 int min_tso_segs)
1802{
1803 u32 bytes, segs;
1804
1805 bytes = min_t(unsigned long,
1806 sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
1807 sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
1808
1809
1810
1811
1812
1813
1814 segs = max_t(u32, bytes / mss_now, min_tso_segs);
1815
1816 return segs;
1817}

/* Return the number of segments we want in the skb we are transmitting.
 * See if congestion control module wants to decide; otherwise, autosize.
 */
1822static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
1823{
1824 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1825 u32 min_tso, tso_segs;
1826
1827 min_tso = ca_ops->min_tso_segs ?
1828 ca_ops->min_tso_segs(sk) :
1829 sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
1830
1831 tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
1832 return min_t(u32, tso_segs, sk->sk_gso_max_segs);
1833}

/* Returns the portion of skb which can be sent right away */
1836static unsigned int tcp_mss_split_point(const struct sock *sk,
1837 const struct sk_buff *skb,
1838 unsigned int mss_now,
1839 unsigned int max_segs,
1840 int nonagle)
1841{
1842 const struct tcp_sock *tp = tcp_sk(sk);
1843 u32 partial, needed, window, max_len;
1844
1845 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1846 max_len = mss_now * max_segs;
1847
1848 if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
1849 return max_len;
1850
1851 needed = min(skb->len, window);
1852
1853 if (max_len <= needed)
1854 return max_len;
1855
1856 partial = needed % mss_now;
1857
1858
1859
1860
1861 if (tcp_nagle_check(partial != 0, tp, nonagle))
1862 return needed - partial;
1863
1864 return needed;
1865}

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules?  If so, return how many segments are allowed.
 */
1870static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1871 const struct sk_buff *skb)
1872{
1873 u32 in_flight, cwnd, halfcwnd;
1874
1875
1876 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
1877 tcp_skb_pcount(skb) == 1)
1878 return 1;
1879
1880 in_flight = tcp_packets_in_flight(tp);
1881 cwnd = tp->snd_cwnd;
1882 if (in_flight >= cwnd)
1883 return 0;
1884
1885
1886
1887
1888 halfcwnd = max(cwnd >> 1, 1U);
1889 return min(halfcwnd, cwnd - in_flight);
1890}
1891

/* Initialize TSO state of a skb.
 * This must be invoked the first time we consider transmitting
 * SKB onto the wire.
 */
1896static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
1897{
1898 int tso_segs = tcp_skb_pcount(skb);
1899
1900 if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
1901 tcp_set_skb_tso_segs(skb, mss_now);
1902 tso_segs = tcp_skb_pcount(skb);
1903 }
1904 return tso_segs;
1905}
1906

/* Return true if the Nagle test allows this packet to be
 * sent now.
 */
1911static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
1912 unsigned int cur_mss, int nonagle)
{
 /* Nagle rule does not apply to frames, which sit in the middle of the
  * write_queue (they have no chances to get new data).
  *
  * This is implemented in the callers, where they modify the 'nonagle'
  * argument based upon the location of SKB in the send queue.
  */
1920 if (nonagle & TCP_NAGLE_PUSH)
1921 return true;
1922
1923
1924 if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
1925 return true;
1926
1927 if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
1928 return true;
1929
1930 return false;
1931}

/* Does at least the first segment of SKB fit into the send window? */
1934static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1935 const struct sk_buff *skb,
1936 unsigned int cur_mss)
1937{
1938 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
1939
1940 if (skb->len > cur_mss)
1941 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
1942
1943 return !after(end_seq, tcp_wnd_end(tp));
1944}

/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list.  It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation.  In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 */
1953static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1954 unsigned int mss_now, gfp_t gfp)
1955{
1956 int nlen = skb->len - len;
1957 struct sk_buff *buff;
1958 u8 flags;
1959
1960
1961 if (skb->len != skb->data_len)
1962 return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
1963 skb, len, mss_now, gfp);
1964
1965 buff = sk_stream_alloc_skb(sk, 0, gfp, true);
1966 if (unlikely(!buff))
1967 return -ENOMEM;
1968 skb_copy_decrypted(buff, skb);
1969
1970 sk_wmem_queued_add(sk, buff->truesize);
1971 sk_mem_charge(sk, buff->truesize);
1972 buff->truesize += nlen;
1973 skb->truesize -= nlen;
1974
1975
1976 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1977 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1978 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1979
1980
1981 flags = TCP_SKB_CB(skb)->tcp_flags;
1982 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1983 TCP_SKB_CB(buff)->tcp_flags = flags;
1984
1985
1986 TCP_SKB_CB(buff)->sacked = 0;
1987
1988 tcp_skb_fragment_eor(skb, buff);
1989
1990 buff->ip_summed = CHECKSUM_PARTIAL;
1991 skb_split(skb, buff, len);
1992 tcp_fragment_tstamp(skb, buff);
1993
1994
1995 tcp_set_skb_tso_segs(skb, mss_now);
1996 tcp_set_skb_tso_segs(buff, mss_now);
1997
1998
1999 __skb_header_release(buff);
2000 tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
2001
2002 return 0;
2003}

/* Try to defer sending, if possible, in order to minimize the amount
 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
 *
 * This algorithm is from John Heffner.
 */
2010static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
2011 bool *is_cwnd_limited,
2012 bool *is_rwnd_limited,
2013 u32 max_segs)
2014{
2015 const struct inet_connection_sock *icsk = inet_csk(sk);
2016 u32 send_win, cong_win, limit, in_flight;
2017 struct tcp_sock *tp = tcp_sk(sk);
2018 struct sk_buff *head;
2019 int win_divisor;
2020 s64 delta;
2021
2022 if (icsk->icsk_ca_state >= TCP_CA_Recovery)
2023 goto send_now;

 /* Avoid bursty behavior by allowing defer
  * only if the last write was recent (1 ms).
  * Note that tp->tcp_wstamp_ns can be in the future if we have
  * packets waiting in a qdisc or device for EDT delivery.
  */
2030 delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
2031 if (delta > 0)
2032 goto send_now;
2033
2034 in_flight = tcp_packets_in_flight(tp);
2035
2036 BUG_ON(tcp_skb_pcount(skb) <= 1);
2037 BUG_ON(tp->snd_cwnd <= in_flight);
2038
2039 send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2040
2041
2042 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
2043
2044 limit = min(send_win, cong_win);
2045
2046
2047 if (limit >= max_segs * tp->mss_cache)
2048 goto send_now;
2049
2050
2051 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
2052 goto send_now;
2053
2054 win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
2055 if (win_divisor) {
2056 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
2057
2058
2059
2060
2061 chunk /= win_divisor;
2062 if (limit >= chunk)
2063 goto send_now;
2064 } else {
2065
2066
2067
2068
2069
2070 if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
2071 goto send_now;
2072 }
2073
2074
2075 head = tcp_rtx_queue_head(sk);
2076 if (!head)
2077 goto send_now;
2078 delta = tp->tcp_clock_cache - head->tstamp;
2079
2080 if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
2081 goto send_now;
2082
2083
2084
2085
2086
2087
2088
2089 if (cong_win < send_win) {
2090 if (cong_win <= skb->len) {
2091 *is_cwnd_limited = true;
2092 return true;
2093 }
2094 } else {
2095 if (send_win <= skb->len) {
2096 *is_rwnd_limited = true;
2097 return true;
2098 }
2099 }

 /* If this packet won't get more data, do not wait. */
2102 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
2103 TCP_SKB_CB(skb)->eor)
2104 goto send_now;
2105
2106 return true;
2107
2108send_now:
2109 return false;
2110}
2111
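/* If MTU probing has made no progress for longer than the reprobe
 * interval, reset the probe size and search range and start over.
 */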
2112static inline void tcp_mtu_check_reprobe(struct sock *sk)
2113{
2114 struct inet_connection_sock *icsk = inet_csk(sk);
2115 struct tcp_sock *tp = tcp_sk(sk);
2116 struct net *net = sock_net(sk);
2117 u32 interval;
2118 s32 delta;
2119
2120 interval = net->ipv4.sysctl_tcp_probe_interval;
2121 delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
2122 if (unlikely(delta >= interval * HZ)) {
2123 int mss = tcp_current_mss(sk);
2124
2125
2126 icsk->icsk_mtup.probe_size = 0;
2127 icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
2128 sizeof(struct tcphdr) +
2129 icsk->icsk_af_ops->net_header_len;
2130 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
2131
2132
2133 icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
2134 }
2135}
2136
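/* Can the first @len bytes of the send queue be coalesced into one probe
 * skb without crossing an EOR boundary or a tx-timestamp request?
 */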
2137static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
2138{
2139 struct sk_buff *skb, *next;
2140
2141 skb = tcp_send_head(sk);
2142 tcp_for_write_queue_from_safe(skb, next, sk) {
2143 if (len <= skb->len)
2144 break;
2145
2146 if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
2147 return false;
2148
2149 len -= skb->len;
2150 }
2151
2152 return true;
2153}
2154

/* Create a new MTU probe if we are ready.
 * MTU probing regularly attempts to increase the path MTU by
 * deliberately sending larger packets.  This discovers routing
 * changes resulting in larger path MTUs.
 *
 * Returns 0 if we should wait to probe (no cwnd available),
 *         1 if a probe was sent,
 *         -1 otherwise
 */
2164static int tcp_mtu_probe(struct sock *sk)
2165{
2166 struct inet_connection_sock *icsk = inet_csk(sk);
2167 struct tcp_sock *tp = tcp_sk(sk);
2168 struct sk_buff *skb, *nskb, *next;
2169 struct net *net = sock_net(sk);
2170 int probe_size;
2171 int size_needed;
2172 int copy, len;
2173 int mss_now;
2174 int interval;
2175
2176
2177
2178
2179
2180
2181 if (likely(!icsk->icsk_mtup.enabled ||
2182 icsk->icsk_mtup.probe_size ||
2183 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
2184 tp->snd_cwnd < 11 ||
2185 tp->rx_opt.num_sacks || tp->rx_opt.dsack))
2186 return -1;
2187
2188
2189
2190
2191
2192 mss_now = tcp_current_mss(sk);
2193 probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
2194 icsk->icsk_mtup.search_low) >> 1);
2195 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
2196 interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
2197
2198
2199
2200
2201 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
2202 interval < net->ipv4.sysctl_tcp_probe_threshold) {
2203
2204
2205
2206 tcp_mtu_check_reprobe(sk);
2207 return -1;
2208 }
2209
2210
2211 if (tp->write_seq - tp->snd_nxt < size_needed)
2212 return -1;
2213
2214 if (tp->snd_wnd < size_needed)
2215 return -1;
2216 if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
2217 return 0;
2218
2219
2220 if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
2221 if (!tcp_packets_in_flight(tp))
2222 return -1;
2223 else
2224 return 0;
2225 }
2226
2227 if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
2228 return -1;
2229
2230
2231 nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
2232 if (!nskb)
2233 return -1;
2234 sk_wmem_queued_add(sk, nskb->truesize);
2235 sk_mem_charge(sk, nskb->truesize);
2236
2237 skb = tcp_send_head(sk);
2238 skb_copy_decrypted(nskb, skb);
2239
2240 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
2241 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
2242 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
2243 TCP_SKB_CB(nskb)->sacked = 0;
2244 nskb->csum = 0;
2245 nskb->ip_summed = CHECKSUM_PARTIAL;
2246
2247 tcp_insert_write_queue_before(nskb, skb, sk);
2248 tcp_highest_sack_replace(sk, skb, nskb);
2249
2250 len = 0;
2251 tcp_for_write_queue_from_safe(skb, next, sk) {
2252 copy = min_t(int, skb->len, probe_size - len);
2253 skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
2254
2255 if (skb->len <= copy) {
2256
2257
2258 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2259
2260
2261
2262 TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
2263 tcp_skb_collapse_tstamp(nskb, skb);
2264 tcp_unlink_write_queue(skb, sk);
2265 sk_wmem_free_skb(sk, skb);
2266 } else {
2267 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
2268 ~(TCPHDR_FIN|TCPHDR_PSH);
2269 if (!skb_shinfo(skb)->nr_frags) {
2270 skb_pull(skb, copy);
2271 } else {
2272 __pskb_trim_head(skb, copy);
2273 tcp_set_skb_tso_segs(skb, mss_now);
2274 }
2275 TCP_SKB_CB(skb)->seq += copy;
2276 }
2277
2278 len += copy;
2279
2280 if (len >= probe_size)
2281 break;
2282 }
2283 tcp_init_tso_segs(nskb, nskb->len);
2284
2285
2286
2287
2288 if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
2289
2290
2291 tp->snd_cwnd--;
2292 tcp_event_new_data_sent(sk, nskb);
2293
2294 icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
2295 tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
2296 tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
2297
2298 return 1;
2299 }
2300
2301 return -1;
2302}
2303
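/* Returns true if internal pacing says this transmit must wait; arms the
 * pacing hrtimer for the scheduled delivery time if it is not yet queued.
 */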
2304static bool tcp_pacing_check(struct sock *sk)
2305{
2306 struct tcp_sock *tp = tcp_sk(sk);
2307
2308 if (!tcp_needs_internal_pacing(sk))
2309 return false;
2310
2311 if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2312 return false;
2313
2314 if (!hrtimer_is_queued(&tp->pacing_timer)) {
2315 hrtimer_start(&tp->pacing_timer,
2316 ns_to_ktime(tp->tcp_wstamp_ns),
2317 HRTIMER_MODE_ABS_PINNED_SOFT);
2318 sock_hold(sk);
2319 }
2320 return true;
2321}

/* TCP Small Queues :
 * Control the amount of data queued in qdisc/device layers to roughly two
 * packets or ~1 ms worth at the current pacing rate (the limit is shifted
 * left by @factor, e.g. doubled for retransmits).
 * This allows for :
 *  - better RTT estimation and ACK scheduling
 *  - faster recovery
 *  - high rates
 * Alas, some drivers / subsystems require a fair amount
 * of queued bytes to ensure line rate.
 * One example is wifi aggregation (802.11 AMPDU)
 */
2334static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
2335 unsigned int factor)
2336{
2337 unsigned long limit;
2338
2339 limit = max_t(unsigned long,
2340 2 * skb->truesize,
2341 sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
2342 if (sk->sk_pacing_status == SK_PACING_NONE)
2343 limit = min_t(unsigned long, limit,
2344 sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2345 limit <<= factor;
2346
2347 if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
2348 tcp_sk(sk)->tcp_tx_delay) {
2349 u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
2350
2351
2352
2353
2354
2355
2356 extra_bytes >>= (20 - 1);
2357 limit += extra_bytes;
2358 }
2359 if (refcount_read(&sk->sk_wmem_alloc) > limit) {
2360
2361
2362
2363
2364
2365 if (tcp_rtx_queue_empty(sk))
2366 return false;
2367
2368 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
2369
2370
2371
2372
2373 smp_mb__after_atomic();
2374 if (refcount_read(&sk->sk_wmem_alloc) > limit)
2375 return true;
2376 }
2377 return false;
2378}
2379
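/* Close out the currently tracked chronograph period (accumulating its
 * duration) and start tracking @new.
 */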
2380static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
2381{
2382 const u32 now = tcp_jiffies32;
2383 enum tcp_chrono old = tp->chrono_type;
2384
2385 if (old > TCP_CHRONO_UNSPEC)
2386 tp->chrono_stat[old - 1] += now - tp->chrono_start;
2387 tp->chrono_start = now;
2388 tp->chrono_type = new;
2389}
2390
2391void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
2392{
2393 struct tcp_sock *tp = tcp_sk(sk);
2394
2395
2396
2397
2398
2399
2400 if (type > tp->chrono_type)
2401 tcp_chrono_set(tp, type);
2402}
2403
2404void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
2405{
2406 struct tcp_sock *tp = tcp_sk(sk);
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416 if (tcp_rtx_and_write_queues_empty(sk))
2417 tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
2418 else if (type == tp->chrono_type)
2419 tcp_chrono_set(tp, TCP_CHRONO_BUSY);
2420}
2421

/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Send at most one packet when push_one > 0.  Temporarily ignore the
 * cwnd limit to force at most one packet out when push_one == 2.
 *
 * Returns true if no segments are in flight and we have queued segments,
 * but cannot send anything now because of SWS or another problem.
 */
2436static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2437 int push_one, gfp_t gfp)
2438{
2439 struct tcp_sock *tp = tcp_sk(sk);
2440 struct sk_buff *skb;
2441 unsigned int tso_segs, sent_pkts;
2442 int cwnd_quota;
2443 int result;
2444 bool is_cwnd_limited = false, is_rwnd_limited = false;
2445 u32 max_segs;
2446
2447 sent_pkts = 0;
2448
2449 tcp_mstamp_refresh(tp);
2450 if (!push_one) {
2451
2452 result = tcp_mtu_probe(sk);
2453 if (!result) {
2454 return false;
2455 } else if (result > 0) {
2456 sent_pkts = 1;
2457 }
2458 }
2459
2460 max_segs = tcp_tso_segs(sk, mss_now);
2461 while ((skb = tcp_send_head(sk))) {
2462 unsigned int limit;
2463
2464 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2465
2466 skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2467 list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
2468 tcp_init_tso_segs(skb, mss_now);
2469 goto repair;
2470 }
2471
2472 if (tcp_pacing_check(sk))
2473 break;
2474
2475 tso_segs = tcp_init_tso_segs(skb, mss_now);
2476 BUG_ON(!tso_segs);
2477
2478 cwnd_quota = tcp_cwnd_test(tp, skb);
2479 if (!cwnd_quota) {
2480 if (push_one == 2)
2481
2482 cwnd_quota = 1;
2483 else
2484 break;
2485 }
2486
2487 if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
2488 is_rwnd_limited = true;
2489 break;
2490 }
2491
2492 if (tso_segs == 1) {
2493 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
2494 (tcp_skb_is_last(sk, skb) ?
2495 nonagle : TCP_NAGLE_PUSH))))
2496 break;
2497 } else {
2498 if (!push_one &&
2499 tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
2500 &is_rwnd_limited, max_segs))
2501 break;
2502 }
2503
2504 limit = mss_now;
2505 if (tso_segs > 1 && !tcp_urg_mode(tp))
2506 limit = tcp_mss_split_point(sk, skb, mss_now,
2507 min_t(unsigned int,
2508 cwnd_quota,
2509 max_segs),
2510 nonagle);
2511
2512 if (skb->len > limit &&
2513 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2514 break;
2515
2516 if (tcp_small_queue_check(sk, skb, 0))
2517 break;
2518
2519
2520
2521
2522
2523
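 /* An empty skb at the send head means another thread is still
  * filling it (e.g. sleeping in sendmsg()); transmitting it now
  * would emit a pure ACK that gets silently discarded, so stop here.
  */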
2524 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
2525 break;
2526
2527 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2528 break;
2529
2530repair:
2531
2532
2533
2534 tcp_event_new_data_sent(sk, skb);
2535
2536 tcp_minshall_update(tp, mss_now, skb);
2537 sent_pkts += tcp_skb_pcount(skb);
2538
2539 if (push_one)
2540 break;
2541 }
2542
2543 if (is_rwnd_limited)
2544 tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
2545 else
2546 tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
2547
2548 if (likely(sent_pkts)) {
2549 if (tcp_in_cwnd_reduction(sk))
2550 tp->prr_out += sent_pkts;
2551
2552
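 /* Arm a tail loss probe timer unless this pass was itself a TLP
  * transmission (push_one == 2).
  */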
2553 if (push_one != 2)
2554 tcp_schedule_loss_probe(sk, false);
2555 is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
2556 tcp_cwnd_validate(sk, is_cwnd_limited);
2557 return false;
2558 }
2559 return !tp->packets_out && !tcp_write_queue_empty(sk);
2560}
2561
2562bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
2563{
2564 struct inet_connection_sock *icsk = inet_csk(sk);
2565 struct tcp_sock *tp = tcp_sk(sk);
2566 u32 timeout, rto_delta_us;
2567 int early_retrans;
2568
2569
2570
2571
2572 if (rcu_access_pointer(tp->fastopen_rsk))
2573 return false;
2574
2575 early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
2576
2577
2578
2579 if ((early_retrans != 3 && early_retrans != 4) ||
2580 !tp->packets_out || !tcp_is_sack(tp) ||
2581 (icsk->icsk_ca_state != TCP_CA_Open &&
2582 icsk->icsk_ca_state != TCP_CA_CWR))
2583 return false;
2584
2585
2586
2587
2588
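 /* Probe timeout is roughly 2*srtt, plus a minimum to absorb a
  * delayed ACK when only one packet is outstanding.  With no RTT
  * sample yet, fall back to the initial timeout.
  */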
2589 if (tp->srtt_us) {
2590 timeout = usecs_to_jiffies(tp->srtt_us >> 2);
2591 if (tp->packets_out == 1)
2592 timeout += TCP_RTO_MIN;
2593 else
2594 timeout += TCP_TIMEOUT_MIN;
2595 } else {
2596 timeout = TCP_TIMEOUT_INIT;
2597 }
2598
2599
2600 rto_delta_us = advancing_rto ?
2601 jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
2602 tcp_rto_delta_us(sk);
2603 if (rto_delta_us > 0)
2604 timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
2605
2606 tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
2607 return true;
2608}
2609
2610
2611
2612
2613
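/* Thanks to skb fast clones, we can detect that a prior transmit of a
 * packet is still sitting in a qdisc or driver queue; retransmitting
 * it again in that case is pointless.
 */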
2614static bool skb_still_in_host_queue(const struct sock *sk,
2615 const struct sk_buff *skb)
2616{
2617 if (unlikely(skb_fclone_busy(sk, skb))) {
2618 NET_INC_STATS(sock_net(sk),
2619 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2620 return true;
2621 }
2622 return false;
2623}
2624
2625
2626
2627
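/* When the loss probe timeout (PTO) fires, send one new segment if
 * possible, otherwise retransmit the last segment, and record
 * tlp_high_seq so the ACK of the probe can be classified later.
 */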
2628void tcp_send_loss_probe(struct sock *sk)
2629{
2630 struct tcp_sock *tp = tcp_sk(sk);
2631 struct sk_buff *skb;
2632 int pcount;
2633 int mss = tcp_current_mss(sk);
2634
2635
2636 if (tp->tlp_high_seq)
2637 goto rearm_timer;
2638
2639 tp->tlp_retrans = 0;
2640 skb = tcp_send_head(sk);
2641 if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
2642 pcount = tp->packets_out;
2643 tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
2644 if (tp->packets_out > pcount)
2645 goto probe_sent;
2646 goto rearm_timer;
2647 }
2648 skb = skb_rb_last(&sk->tcp_rtx_queue);
2649 if (unlikely(!skb)) {
2650 WARN_ONCE(tp->packets_out,
2651 "invalid inflight: %u state %u cwnd %u mss %d\n",
2652 tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
2653 inet_csk(sk)->icsk_pending = 0;
2654 return;
2655 }
2656
2657 if (skb_still_in_host_queue(sk, skb))
2658 goto rearm_timer;
2659
2660 pcount = tcp_skb_pcount(skb);
2661 if (WARN_ON(!pcount))
2662 goto rearm_timer;
2663
2664 if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
2665 if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2666 (pcount - 1) * mss, mss,
2667 GFP_ATOMIC)))
2668 goto rearm_timer;
2669 skb = skb_rb_next(skb);
2670 }
2671
2672 if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
2673 goto rearm_timer;
2674
2675 if (__tcp_retransmit_skb(sk, skb, 1))
2676 goto rearm_timer;
2677
2678 tp->tlp_retrans = 1;
2679
2680probe_sent:
2681
2682 tp->tlp_high_seq = tp->snd_nxt;
2683
2684 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
2685
2686 inet_csk(sk)->icsk_pending = 0;
2687rearm_timer:
2688 tcp_rearm_rto(sk);
2689}
2690
2691
2692
2693
2694
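/* Push out any pending frames that were held back, e.g. by TCP_CORK or
 * an attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 */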
2695void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2696 int nonagle)
2697{
2698
2699
2700
2701
2702 if (unlikely(sk->sk_state == TCP_CLOSE))
2703 return;
2704
2705 if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2706 sk_gfp_mask(sk, GFP_ATOMIC)))
2707 tcp_check_probe_timer(sk);
2708}
2709
2710
2711
2712
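/* Send the single skb sitting at the send head, bypassing Nagle. */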
2713void tcp_push_one(struct sock *sk, unsigned int mss_now)
2714{
2715 struct sk_buff *skb = tcp_send_head(sk);
2716
2717 BUG_ON(!skb || skb->len < mss_now);
2718
2719 tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
2720}
2721
2773
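/* Compute the receive window to advertise, based on the free receive
 * space, the current rcv_ssthresh clamp and window scaling.  Without
 * window scaling the result is rounded to a multiple of the MSS so the
 * advertised window shrinks smoothly (silly window syndrome avoidance);
 * returning 0 keeps the window closed.
 */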
2774u32 __tcp_select_window(struct sock *sk)
2775{
2776 struct inet_connection_sock *icsk = inet_csk(sk);
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779
2780
2781
2782
2783
2784 int mss = icsk->icsk_ack.rcv_mss;
2785 int free_space = tcp_space(sk);
2786 int allowed_space = tcp_full_space(sk);
2787 int full_space, window;
2788
2789 if (sk_is_mptcp(sk))
2790 mptcp_space(sk, &free_space, &allowed_space);
2791
2792 full_space = min_t(int, tp->window_clamp, allowed_space);
2793
2794 if (unlikely(mss > full_space)) {
2795 mss = full_space;
2796 if (mss <= 0)
2797 return 0;
2798 }
2799 if (free_space < (full_space >> 1)) {
2800 icsk->icsk_ack.quick = 0;
2801
2802 if (tcp_under_memory_pressure(sk))
2803 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2804 4U * tp->advmss);
2805
2806
2807
2808
2809 free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
2810
2811
2812
2813
2814
2815
2816
2817
2818 if (free_space < (allowed_space >> 4) || free_space < mss)
2819 return 0;
2820 }
2821
2822 if (free_space > tp->rcv_ssthresh)
2823 free_space = tp->rcv_ssthresh;
2824
2825
2826
2827
2828 if (tp->rx_opt.rcv_wscale) {
2829 window = free_space;
2830
2831
2832
2833
2834
2835 window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
2836 } else {
2837 window = tp->rcv_wnd;
2838
2839
2840
2841
2842
2843
2844
2845
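 /* Without window scaling, offer the largest nice multiple of mss
  * fitting in free_space, but keep the previous offer when it is
  * already within one mss of it to avoid needless recalculation.
  */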
2846 if (window <= free_space - mss || window > free_space)
2847 window = rounddown(free_space, mss);
2848 else if (mss == full_space &&
2849 free_space > window + (full_space >> 1))
2850 window = free_space;
2851 }
2852
2853 return window;
2854}
2855
2856void tcp_skb_collapse_tstamp(struct sk_buff *skb,
2857 const struct sk_buff *next_skb)
2858{
2859 if (unlikely(tcp_has_tx_tstamp(next_skb))) {
2860 const struct skb_shared_info *next_shinfo =
2861 skb_shinfo(next_skb);
2862 struct skb_shared_info *shinfo = skb_shinfo(skb);
2863
2864 shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
2865 shinfo->tskey = next_shinfo->tskey;
2866 TCP_SKB_CB(skb)->txstamp_ack |=
2867 TCP_SKB_CB(next_skb)->txstamp_ack;
2868 }
2869}
2870
2871
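/* Collapse the next skb in the rtx queue into @skb so that fewer,
 * fuller packets are retransmitted.
 */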
2872static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2873{
2874 struct tcp_sock *tp = tcp_sk(sk);
2875 struct sk_buff *next_skb = skb_rb_next(skb);
2876 int next_skb_size;
2877
2878 next_skb_size = next_skb->len;
2879
2880 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
2881
2882 if (next_skb_size) {
2883 if (next_skb_size <= skb_availroom(skb))
2884 skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
2885 next_skb_size);
2886 else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
2887 return false;
2888 }
2889 tcp_highest_sack_replace(sk, next_skb, skb);
2890
2891
2892 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
2893
2894
2895 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
2896
2897
2898
2899
2900 TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
2901 TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
2902
2903
2904 tcp_clear_retrans_hints_partial(tp);
2905 if (next_skb == tp->retransmit_skb_hint)
2906 tp->retransmit_skb_hint = skb;
2907
2908 tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
2909
2910 tcp_skb_collapse_tstamp(skb, next_skb);
2911
2912 tcp_rtx_queue_unlink_and_free(next_skb, sk);
2913 return true;
2914}
2915
2916
2917static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
2918{
2919 if (tcp_skb_pcount(skb) > 1)
2920 return false;
2921 if (skb_cloned(skb))
2922 return false;
2923
2924 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2925 return false;
2926
2927 return true;
2928}
2929
2930
2931
2932
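/* Try to collapse as many following rtx-queue skbs as fit into @space
 * bytes, to put fewer packets on the wire on retransmission.
 */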
2933static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2934 int space)
2935{
2936 struct tcp_sock *tp = tcp_sk(sk);
2937 struct sk_buff *skb = to, *tmp;
2938 bool first = true;
2939
2940 if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
2941 return;
2942 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
2943 return;
2944
2945 skb_rbtree_walk_from_safe(skb, tmp) {
2946 if (!tcp_can_collapse(sk, skb))
2947 break;
2948
2949 if (!tcp_skb_can_collapse(to, skb))
2950 break;
2951
2952 space -= skb->len;
2953
2954 if (first) {
2955 first = false;
2956 continue;
2957 }
2958
2959 if (space < 0)
2960 break;
2961
2962 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
2963 break;
2964
2965 if (!tcp_collapse_retrans(sk, to))
2966 break;
2967 }
2968}
2969
2970
2971
2972
2973
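/* Retransmit one skb.  Policy decisions and retransmit-queue state
 * updates are done by the caller; returns non-zero if an error
 * prevented the transmit.
 */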
2974int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
2975{
2976 struct inet_connection_sock *icsk = inet_csk(sk);
2977 struct tcp_sock *tp = tcp_sk(sk);
2978 unsigned int cur_mss;
2979 int diff, len, err;
2980
2981
2982
2983 if (icsk->icsk_mtup.probe_size)
2984 icsk->icsk_mtup.probe_size = 0;
2985
2986
2987
2988
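 /* Do not queue more than the send buffer allows (with a 25%
  * allowance for copying overhead); otherwise back off and let the
  * caller retry later.
  */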
2989 if (refcount_read(&sk->sk_wmem_alloc) >
2990 min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
2991 sk->sk_sndbuf))
2992 return -EAGAIN;
2993
2994 if (skb_still_in_host_queue(sk, skb))
2995 return -EBUSY;
2996
2997 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
2998 if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
2999 WARN_ON_ONCE(1);
3000 return -EINVAL;
3001 }
3002 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3003 return -ENOMEM;
3004 }
3005
3006 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3007 return -EHOSTUNREACH;
3008
3009 cur_mss = tcp_current_mss(sk);
3010
3011
3012
3013
3014
3015
3016 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
3017 TCP_SKB_CB(skb)->seq != tp->snd_una)
3018 return -EAGAIN;
3019
3020 len = cur_mss * segs;
3021 if (skb->len > len) {
3022 if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
3023 cur_mss, GFP_ATOMIC))
3024 return -ENOMEM;
3025 } else {
3026 if (skb_unclone(skb, GFP_ATOMIC))
3027 return -ENOMEM;
3028
3029 diff = tcp_skb_pcount(skb);
3030 tcp_set_skb_tso_segs(skb, cur_mss);
3031 diff -= tcp_skb_pcount(skb);
3032 if (diff)
3033 tcp_adjust_pcount(sk, skb, diff);
3034 if (skb->len < cur_mss)
3035 tcp_retrans_try_collapse(sk, skb, cur_mss);
3036 }
3037
3038
3039 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
3040 tcp_ecn_clear_syn(sk, skb);
3041
3042
3043 segs = tcp_skb_pcount(skb);
3044 TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
3045 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3046 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3047 tp->total_retrans += segs;
3048 tp->bytes_retrans += skb->len;
3049
3050
3051
3052
3053
3054 if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
3055 skb_headroom(skb) >= 0xFFFF)) {
3056 struct sk_buff *nskb;
3057
3058 tcp_skb_tsorted_save(skb) {
3059 nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
3060 if (nskb) {
3061 nskb->dev = NULL;
3062 err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
3063 } else {
3064 err = -ENOBUFS;
3065 }
3066 } tcp_skb_tsorted_restore(skb);
3067
3068 if (!err) {
3069 tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
3070 tcp_rate_skb_sent(sk, skb);
3071 }
3072 } else {
3073 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3074 }
3075
3076
3077
3078
3079 TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
3080
3081 if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
3082 tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
3083 TCP_SKB_CB(skb)->seq, segs, err);
3084
3085 if (likely(!err)) {
3086 trace_tcp_retransmit_skb(sk, skb);
3087 } else if (err != -EBUSY) {
3088 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
3089 }
3090 return err;
3091}
3092
3093int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
3094{
3095 struct tcp_sock *tp = tcp_sk(sk);
3096 int err = __tcp_retransmit_skb(sk, skb, segs);
3097
3098 if (err == 0) {
3099#if FASTRETRANS_DEBUG > 0
3100 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
3101 net_dbg_ratelimited("retrans_out leaked\n");
3102 }
3103#endif
3104 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
3105 tp->retrans_out += tcp_skb_pcount(skb);
3106 }
3107
3108
3109 if (!tp->retrans_stamp)
3110 tp->retrans_stamp = tcp_skb_timestamp(skb);
3111
3112 if (tp->undo_retrans < 0)
3113 tp->undo_retrans = 0;
3114 tp->undo_retrans += tcp_skb_pcount(skb);
3115 return err;
3116}
3117
3118
3119
3120
3121
3122
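/* Walk the retransmit queue and resend packets marked lost, starting
 * from the retransmit hint, until the congestion window (or pacing /
 * small-queue limits) stop us.  Called during loss recovery.
 */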
3123void tcp_xmit_retransmit_queue(struct sock *sk)
3124{
3125 const struct inet_connection_sock *icsk = inet_csk(sk);
3126 struct sk_buff *skb, *rtx_head, *hole = NULL;
3127 struct tcp_sock *tp = tcp_sk(sk);
3128 bool rearm_timer = false;
3129 u32 max_segs;
3130 int mib_idx;
3131
3132 if (!tp->packets_out)
3133 return;
3134
3135 rtx_head = tcp_rtx_queue_head(sk);
3136 skb = tp->retransmit_skb_hint ?: rtx_head;
3137 max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
3138 skb_rbtree_walk_from(skb) {
3139 __u8 sacked;
3140 int segs;
3141
3142 if (tcp_pacing_check(sk))
3143 break;
3144
3145
3146 if (!hole)
3147 tp->retransmit_skb_hint = skb;
3148
3149 segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
3150 if (segs <= 0)
3151 break;
3152 sacked = TCP_SKB_CB(skb)->sacked;
3153
3154
3155
3156 segs = min_t(int, segs, max_segs);
3157
3158 if (tp->retrans_out >= tp->lost_out) {
3159 break;
3160 } else if (!(sacked & TCPCB_LOST)) {
3161 if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
3162 hole = skb;
3163 continue;
3164
3165 } else {
3166 if (icsk->icsk_ca_state != TCP_CA_Loss)
3167 mib_idx = LINUX_MIB_TCPFASTRETRANS;
3168 else
3169 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
3170 }
3171
3172 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
3173 continue;
3174
3175 if (tcp_small_queue_check(sk, skb, 1))
3176 break;
3177
3178 if (tcp_retransmit_skb(sk, skb, segs))
3179 break;
3180
3181 NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
3182
3183 if (tcp_in_cwnd_reduction(sk))
3184 tp->prr_out += tcp_skb_pcount(skb);
3185
3186 if (skb == rtx_head &&
3187 icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3188 rearm_timer = true;
3189
3190 }
3191 if (rearm_timer)
3192 tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3193 inet_csk(sk)->icsk_rto,
3194 TCP_RTO_MAX);
3195}
3196
3197
3198
3199
3200
3201
3202
3203
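/* Forcibly account @size bytes to the socket even when over memory
 * limits; used so that e.g. a FIN can still be sent under memory
 * pressure.
 */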
3204void sk_forced_mem_schedule(struct sock *sk, int size)
3205{
3206 int amt;
3207
3208 if (size <= sk->sk_forward_alloc)
3209 return;
3210 amt = sk_mem_pages(size);
3211 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
3212 sk_memory_allocated_add(sk, amt);
3213
3214 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3215 mem_cgroup_charge_skmem(sk->sk_memcg, amt);
3216}
3217
3218
3219
3220
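/* Send a FIN.  The caller locks the socket for us.
 * When possible the FIN flag is piggybacked on an already queued skb
 * instead of allocating a new one.
 */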
3221void tcp_send_fin(struct sock *sk)
3222{
3223 struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
3224 struct tcp_sock *tp = tcp_sk(sk);
3225
3226
3227
3228
3229
3230
3231 tskb = tail;
3232 if (!tskb && tcp_under_memory_pressure(sk))
3233 tskb = skb_rb_last(&sk->tcp_rtx_queue);
3234
3235 if (tskb) {
3236 TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
3237 TCP_SKB_CB(tskb)->end_seq++;
3238 tp->write_seq++;
3239 if (!tail) {
3240
3241
3242
3243
3244
3245
3246 WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
3247 return;
3248 }
3249 } else {
3250 skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
3251 if (unlikely(!skb))
3252 return;
3253
3254 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
3255 skb_reserve(skb, MAX_TCP_HEADER);
3256 sk_forced_mem_schedule(sk, skb->truesize);
3257
3258 tcp_init_nondata_skb(skb, tp->write_seq,
3259 TCPHDR_ACK | TCPHDR_FIN);
3260 tcp_queue_skb(sk, skb);
3261 }
3262 __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
3263}
3264
3265
3266
3267
3268
3269
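/* Send an active RST to abort the connection, e.g. when the socket is
 * closed while unread data is pending (cf. RFC 2525, section 2.17).
 */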
3270void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3271{
3272 struct sk_buff *skb;
3273
3274 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3275
3276
3277 skb = alloc_skb(MAX_TCP_HEADER, priority);
3278 if (!skb) {
3279 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3280 return;
3281 }
3282
3283
3284 skb_reserve(skb, MAX_TCP_HEADER);
3285 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
3286 TCPHDR_ACK | TCPHDR_RST);
3287 tcp_mstamp_refresh(tcp_sk(sk));
3288
3289 if (tcp_transmit_skb(sk, skb, 0, priority))
3290 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3291
3292
3293
3294
3295 trace_tcp_send_reset(sk, NULL);
3296}
3297
3298
3299
3300
3301
3302
3303
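/* Send a crossed SYN-ACK during socket establishment.
 * WARNING: this directly uses the first skb of the rtx queue and makes
 * no further checks, so it must only be called in the proper state.
 */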
3304int tcp_send_synack(struct sock *sk)
3305{
3306 struct sk_buff *skb;
3307
3308 skb = tcp_rtx_queue_head(sk);
3309 if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3310 pr_err("%s: wrong queue state\n", __func__);
3311 return -EFAULT;
3312 }
3313 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
3314 if (skb_cloned(skb)) {
3315 struct sk_buff *nskb;
3316
3317 tcp_skb_tsorted_save(skb) {
3318 nskb = skb_copy(skb, GFP_ATOMIC);
3319 } tcp_skb_tsorted_restore(skb);
3320 if (!nskb)
3321 return -ENOMEM;
3322 INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
3323 tcp_highest_sack_replace(sk, skb, nskb);
3324 tcp_rtx_queue_unlink_and_free(skb, sk);
3325 __skb_header_release(nskb);
3326 tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
3327 sk_wmem_queued_add(sk, nskb->truesize);
3328 sk_mem_charge(sk, nskb->truesize);
3329 skb = nskb;
3330 }
3331
3332 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
3333 tcp_ecn_send_synack(sk, skb);
3334 }
3335 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3336}
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
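/* Allocate one skb and build a SYN-ACK packet for @req, using options
 * (timestamps, MD5, Fast Open cookie) as negotiated.  @dst is consumed:
 * the caller must not use it again.
 */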
3349struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
3350 struct request_sock *req,
3351 struct tcp_fastopen_cookie *foc,
3352 enum tcp_synack_type synack_type)
3353{
3354 struct inet_request_sock *ireq = inet_rsk(req);
3355 const struct tcp_sock *tp = tcp_sk(sk);
3356 struct tcp_md5sig_key *md5 = NULL;
3357 struct tcp_out_options opts;
3358 struct sk_buff *skb;
3359 int tcp_header_size;
3360 struct tcphdr *th;
3361 int mss;
3362 u64 now;
3363
3364 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3365 if (unlikely(!skb)) {
3366 dst_release(dst);
3367 return NULL;
3368 }
3369
3370 skb_reserve(skb, MAX_TCP_HEADER);
3371
3372 switch (synack_type) {
3373 case TCP_SYNACK_NORMAL:
3374 skb_set_owner_w(skb, req_to_sk(req));
3375 break;
3376 case TCP_SYNACK_COOKIE:
3377
3378
3379
3380 break;
3381 case TCP_SYNACK_FASTOPEN:
3382
3383
3384
3385
3386 skb_set_owner_w(skb, (struct sock *)sk);
3387 break;
3388 }
3389 skb_dst_set(skb, dst);
3390
3391 mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3392
3393 memset(&opts, 0, sizeof(opts));
3394 now = tcp_clock_ns();
3395#ifdef CONFIG_SYN_COOKIES
3396 if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
3397 skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
3398 else
3399#endif
3400 {
3401 skb->skb_mstamp_ns = now;
3402 if (!tcp_rsk(req)->snt_synack)
3403 tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
3404 }
3405
3406#ifdef CONFIG_TCP_MD5SIG
3407 rcu_read_lock();
3408 md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
3409#endif
3410 skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
3411 tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
3412 foc, synack_type) + sizeof(*th);
3413
3414 skb_push(skb, tcp_header_size);
3415 skb_reset_transport_header(skb);
3416
3417 th = (struct tcphdr *)skb->data;
3418 memset(th, 0, sizeof(struct tcphdr));
3419 th->syn = 1;
3420 th->ack = 1;
3421 tcp_ecn_make_synack(req, th);
3422 th->source = htons(ireq->ir_num);
3423 th->dest = ireq->ir_rmt_port;
3424 skb->mark = ireq->ir_mark;
3425 skb->ip_summed = CHECKSUM_PARTIAL;
3426 th->seq = htonl(tcp_rsk(req)->snt_isn);
3427
3428 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
3429
3430
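 /* RFC1323: the window in SYN and SYN/ACK segments is never scaled. */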
3431 th->window = htons(min(req->rsk_rcv_wnd, 65535U));
3432 tcp_options_write((__be32 *)(th + 1), NULL, &opts);
3433 th->doff = (tcp_header_size >> 2);
3434 __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
3435
3436#ifdef CONFIG_TCP_MD5SIG
3437
3438 if (md5)
3439 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
3440 md5, req_to_sk(req), skb);
3441 rcu_read_unlock();
3442#endif
3443
3444 skb->skb_mstamp_ns = now;
3445 tcp_add_tx_delay(skb, tp);
3446
3447 return skb;
3448}
3449EXPORT_SYMBOL(tcp_make_synack);
3450
3451static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
3452{
3453 struct inet_connection_sock *icsk = inet_csk(sk);
3454 const struct tcp_congestion_ops *ca;
3455 u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
3456
3457 if (ca_key == TCP_CA_UNSPEC)
3458 return;
3459
3460 rcu_read_lock();
3461 ca = tcp_ca_find_key(ca_key);
3462 if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
3463 bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
3464 icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
3465 icsk->icsk_ca_ops = ca;
3466 }
3467 rcu_read_unlock();
3468}
3469
3470
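/* Do all connect socket setups that can be done address-family
 * independent: header length, MSS/window clamps, initial window and
 * sequence state.
 */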
3471static void tcp_connect_init(struct sock *sk)
3472{
3473 const struct dst_entry *dst = __sk_dst_get(sk);
3474 struct tcp_sock *tp = tcp_sk(sk);
3475 __u8 rcv_wscale;
3476 u32 rcv_wnd;
3477
3478
3479
3480
3481 tp->tcp_header_len = sizeof(struct tcphdr);
3482 if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
3483 tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
3484
3485#ifdef CONFIG_TCP_MD5SIG
3486 if (tp->af_specific->md5_lookup(sk, sk))
3487 tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
3488#endif
3489
3490
3491 if (tp->rx_opt.user_mss)
3492 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
3493 tp->max_window = 0;
3494 tcp_mtup_init(sk);
3495 tcp_sync_mss(sk, dst_mtu(dst));
3496
3497 tcp_ca_dst_init(sk, dst);
3498
3499 if (!tp->window_clamp)
3500 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
3501 tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
3502
3503 tcp_initialize_rcv_mss(sk);
3504
3505
3506 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
3507 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
3508 tp->window_clamp = tcp_full_space(sk);
3509
3510 rcv_wnd = tcp_rwnd_init_bpf(sk);
3511 if (rcv_wnd == 0)
3512 rcv_wnd = dst_metric(dst, RTAX_INITRWND);
3513
3514 tcp_select_initial_window(sk, tcp_full_space(sk),
3515 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
3516 &tp->rcv_wnd,
3517 &tp->window_clamp,
3518 sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
3519 &rcv_wscale,
3520 rcv_wnd);
3521
3522 tp->rx_opt.rcv_wscale = rcv_wscale;
3523 tp->rcv_ssthresh = tp->rcv_wnd;
3524
3525 sk->sk_err = 0;
3526 sock_reset_flag(sk, SOCK_DONE);
3527 tp->snd_wnd = 0;
3528 tcp_init_wl(tp, 0);
3529 tcp_write_queue_purge(sk);
3530 tp->snd_una = tp->write_seq;
3531 tp->snd_sml = tp->write_seq;
3532 tp->snd_up = tp->write_seq;
3533 WRITE_ONCE(tp->snd_nxt, tp->write_seq);
3534
3535 if (likely(!tp->repair))
3536 tp->rcv_nxt = 0;
3537 else
3538 tp->rcv_tstamp = tcp_jiffies32;
3539 tp->rcv_wup = tp->rcv_nxt;
3540 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
3541
3542 inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
3543 inet_csk(sk)->icsk_retransmits = 0;
3544 tcp_clear_retrans(tp);
3545}
3546
3547static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
3548{
3549 struct tcp_sock *tp = tcp_sk(sk);
3550 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
3551
3552 tcb->end_seq += skb->len;
3553 __skb_header_release(skb);
3554 sk_wmem_queued_add(sk, skb->truesize);
3555 sk_mem_charge(sk, skb->truesize);
3556 WRITE_ONCE(tp->write_seq, tcb->end_seq);
3557 tp->packets_out += tcp_skb_pcount(skb);
3558}
3559
3560
3561
3562
3563
3564
3565
3566
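/* Build and send a SYN carrying data using a cached Fast Open cookie.
 * The data is queued as a separate data-only skb behind the plain SYN,
 * so that on timeout only the regular SYN is retransmitted.  If no
 * cookie is cached (or anything fails), fall back to a regular SYN
 * that merely requests a cookie.
 */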
3567static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
3568{
3569 struct tcp_sock *tp = tcp_sk(sk);
3570 struct tcp_fastopen_request *fo = tp->fastopen_req;
3571 int space, err = 0;
3572 struct sk_buff *syn_data;
3573
3574 tp->rx_opt.mss_clamp = tp->advmss;
3575 if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
3576 goto fallback;
3577
3578
3579
3580
3581
3582 tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
3583
3584 space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
3585 MAX_TCP_OPTION_SPACE;
3586
3587 space = min_t(size_t, space, fo->size);
3588
3589
3590 space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
3591
3592 syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
3593 if (!syn_data)
3594 goto fallback;
3595 syn_data->ip_summed = CHECKSUM_PARTIAL;
3596 memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
3597 if (space) {
3598 int copied = copy_from_iter(skb_put(syn_data, space), space,
3599 &fo->data->msg_iter);
3600 if (unlikely(!copied)) {
3601 tcp_skb_tsorted_anchor_cleanup(syn_data);
3602 kfree_skb(syn_data);
3603 goto fallback;
3604 }
3605 if (copied != space) {
3606 skb_trim(syn_data, copied);
3607 space = copied;
3608 }
3609 skb_zcopy_set(syn_data, fo->uarg, NULL);
3610 }
3611
3612 if (space == fo->size)
3613 fo->data = NULL;
3614 fo->copied = space;
3615
3616 tcp_connect_queue_skb(sk, syn_data);
3617 if (syn_data->len)
3618 tcp_chrono_start(sk, TCP_CHRONO_BUSY);
3619
3620 err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
3621
3622 syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
3623
3624
3625
3626
3627
3628
3629 TCP_SKB_CB(syn_data)->seq++;
3630 TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
3631 if (!err) {
3632 tp->syn_data = (fo->copied > 0);
3633 tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
3634 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
3635 goto done;
3636 }
3637
3638
3639 __skb_queue_tail(&sk->sk_write_queue, syn_data);
3640 tp->packets_out -= tcp_skb_pcount(syn_data);
3641
3642fallback:
3643
3644 if (fo->cookie.len > 0)
3645 fo->cookie.len = 0;
3646 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
3647 if (err)
3648 tp->syn_fastopen = 0;
3649done:
3650 fo->cookie.len = -1;
3651 return err;
3652}
3653
3654
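/* Build a SYN and send it off, arming the retransmit timer for it. */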
3655int tcp_connect(struct sock *sk)
3656{
3657 struct tcp_sock *tp = tcp_sk(sk);
3658 struct sk_buff *buff;
3659 int err;
3660
3661 tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
3662
3663 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3664 return -EHOSTUNREACH;
3665
3666 tcp_connect_init(sk);
3667
3668 if (unlikely(tp->repair)) {
3669 tcp_finish_connect(sk, NULL);
3670 return 0;
3671 }
3672
3673 buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
3674 if (unlikely(!buff))
3675 return -ENOBUFS;
3676
3677 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
3678 tcp_mstamp_refresh(tp);
3679 tp->retrans_stamp = tcp_time_stamp(tp);
3680 tcp_connect_queue_skb(sk, buff);
3681 tcp_ecn_send_syn(sk, buff);
3682 tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
3683
3684
3685 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
3686 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
3687 if (err == -ECONNREFUSED)
3688 return err;
3689
3690
3691
3692
3693 WRITE_ONCE(tp->snd_nxt, tp->write_seq);
3694 tp->pushed_seq = tp->write_seq;
3695 buff = tcp_send_head(sk);
3696 if (unlikely(buff)) {
3697 WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
3698 tp->pushed_seq = TCP_SKB_CB(buff)->seq;
3699 }
3700 TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
3701
3702
3703 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3704 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
3705 return 0;
3706}
3707EXPORT_SYMBOL(tcp_connect);
3708
3709
3710
3711
3712
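/* Schedule a delayed ACK: pick an ato bounded by the RTT estimate and
 * the delayed-ACK limits, send immediately if an earlier delack timer
 * is about to expire, otherwise (re)arm the delack timer.
 */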
3713void tcp_send_delayed_ack(struct sock *sk)
3714{
3715 struct inet_connection_sock *icsk = inet_csk(sk);
3716 int ato = icsk->icsk_ack.ato;
3717 unsigned long timeout;
3718
3719 if (ato > TCP_DELACK_MIN) {
3720 const struct tcp_sock *tp = tcp_sk(sk);
3721 int max_ato = HZ / 2;
3722
3723 if (inet_csk_in_pingpong_mode(sk) ||
3724 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
3725 max_ato = TCP_DELACK_MAX;
3726
3727
3728
3729
3730
3731
3732
3733 if (tp->srtt_us) {
3734 int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
3735 TCP_DELACK_MIN);
3736
3737 if (rtt < max_ato)
3738 max_ato = rtt;
3739 }
3740
3741 ato = min(ato, max_ato);
3742 }
3743
3744
3745 timeout = jiffies + ato;
3746
3747
3748 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
3749
3750
3751
3752 if (icsk->icsk_ack.blocked ||
3753 time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
3754 tcp_send_ack(sk);
3755 return;
3756 }
3757
3758 if (!time_before(timeout, icsk->icsk_ack.timeout))
3759 timeout = icsk->icsk_ack.timeout;
3760 }
3761 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3762 icsk->icsk_ack.timeout = timeout;
3763 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
3764}
3765
3766
3767void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
3768{
3769 struct sk_buff *buff;
3770
3771
3772 if (sk->sk_state == TCP_CLOSE)
3773 return;
3774
3775
3776
3777
3778
3779 buff = alloc_skb(MAX_TCP_HEADER,
3780 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
3781 if (unlikely(!buff)) {
3782 inet_csk_schedule_ack(sk);
3783 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3784 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3785 TCP_DELACK_MAX, TCP_RTO_MAX);
3786 return;
3787 }
3788
3789
3790 skb_reserve(buff, MAX_TCP_HEADER);
3791 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
3792
3793
3794
3795
3796
3797 skb_set_tcp_pure_ack(buff);
3798
3799
3800 __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
3801}
3802EXPORT_SYMBOL_GPL(__tcp_send_ack);
3803
3804void tcp_send_ack(struct sock *sk)
3805{
3806 __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
3807}
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
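/* Send a packet with an out-of-date sequence number (snd_una - 1 when
 * not probing urgent mode) so the peer is forced to ACK and thereby
 * reveal its current window.
 */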
3820static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
3821{
3822 struct tcp_sock *tp = tcp_sk(sk);
3823 struct sk_buff *skb;
3824
3825
3826 skb = alloc_skb(MAX_TCP_HEADER,
3827 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
3828 if (!skb)
3829 return -1;
3830
3831
3832 skb_reserve(skb, MAX_TCP_HEADER);
3833
3834
3835
3836
3837 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
3838 NET_INC_STATS(sock_net(sk), mib);
3839 return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
3840}
3841
3842
3843void tcp_send_window_probe(struct sock *sk)
3844{
3845 if (sk->sk_state == TCP_ESTABLISHED) {
3846 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
3847 tcp_mstamp_refresh(tcp_sk(sk));
3848 tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
3849 }
3850}
3851
3852
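/* Initiate a keepalive or window probe from the timer: transmit the
 * segment at the send head if the window allows, otherwise send a
 * zero-length probe.
 */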
3853int tcp_write_wakeup(struct sock *sk, int mib)
3854{
3855 struct tcp_sock *tp = tcp_sk(sk);
3856 struct sk_buff *skb;
3857
3858 if (sk->sk_state == TCP_CLOSE)
3859 return -1;
3860
3861 skb = tcp_send_head(sk);
3862 if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
3863 int err;
3864 unsigned int mss = tcp_current_mss(sk);
3865 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
3866
3867 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
3868 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
3869
3870
3871
3872
3873
3874 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
3875 skb->len > mss) {
3876 seg_size = min(seg_size, mss);
3877 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3878 if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
3879 skb, seg_size, mss, GFP_ATOMIC))
3880 return -1;
3881 } else if (!tcp_skb_pcount(skb))
3882 tcp_set_skb_tso_segs(skb, mss);
3883
3884 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3885 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3886 if (!err)
3887 tcp_event_new_data_sent(sk, skb);
3888 return err;
3889 } else {
3890 if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
3891 tcp_xmit_probe_skb(sk, 1, mib);
3892 return tcp_xmit_probe_skb(sk, 0, mib);
3893 }
3894}
3895
3896
3897
3898
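/* A window probe timeout has occurred.  If the window is not closed,
 * send a partial packet, otherwise a zero-length probe, and back off
 * the probe timer.
 */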
3899void tcp_send_probe0(struct sock *sk)
3900{
3901 struct inet_connection_sock *icsk = inet_csk(sk);
3902 struct tcp_sock *tp = tcp_sk(sk);
3903 struct net *net = sock_net(sk);
3904 unsigned long timeout;
3905 int err;
3906
3907 err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
3908
3909 if (tp->packets_out || tcp_write_queue_empty(sk)) {
3910
3911 icsk->icsk_probes_out = 0;
3912 icsk->icsk_backoff = 0;
3913 return;
3914 }
3915
3916 icsk->icsk_probes_out++;
3917 if (err <= 0) {
3918 if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
3919 icsk->icsk_backoff++;
3920 timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
3921 } else {
3922
3923
3924
3925 timeout = TCP_RESOURCE_PROBE_INTERVAL;
3926 }
3927 tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX);
3928}
3929
3930int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
3931{
3932 const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
3933 struct flowi fl;
3934 int res;
3935
3936 tcp_rsk(req)->txhash = net_tx_rndhash();
3937 res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
3938 if (!res) {
3939 __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
3940 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3941 if (unlikely(tcp_passive_fastopen(sk)))
3942 tcp_sk(sk)->total_retrans++;
3943 trace_tcp_retransmit_synack(sk, req);
3944 }
3945 return res;
3946}
3947EXPORT_SYMBOL(tcp_rtx_synack);
3948