#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>
#include <linux/btf.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

/* cmsg flags passed back from tcp_recvmsg_locked() to tcp_recvmsg() to
 * indicate which control messages must be emitted to the caller.
 */
enum {
	TCP_CMSG_INQ = 1,
	TCP_CMSG_TS = 2
};

DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_memory_allocated);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/* Current number of TCP sockets. */
struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_sockets_allocated);

/* TCP splice context, carried from tcp_splice_read() down to the
 * per-skb actor tcp_splice_data_recv().
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/* Pressure flag: non-zero while the stack is under global TCP memory
 * pressure.  It stores the jiffies value at which pressure started, so
 * the chrono MIB counter can be updated when pressure is left.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (READ_ONCE(tcp_memory_pressure))
		return;
	val = jiffies;

	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!READ_ONCE(tcp_memory_pressure))
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
354
355
356static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
357{
358 u8 res = 0;
359
360 if (seconds > 0) {
361 int period = timeout;
362
363 res = 1;
364 while (seconds > period && res < 255) {
365 res++;
366 timeout <<= 1;
367 if (timeout > rto_max)
368 timeout = rto_max;
369 period += timeout;
370 }
371 }
372 return res;
373}
374
375
376static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
377{
378 int period = 0;
379
380 if (retrans > 0) {
381 period = timeout;
382 while (--retrans) {
383 timeout <<= 1;
384 if (timeout > rto_max)
385 timeout = rto_max;
386 period += timeout;
387 }
388 }
389 return period;
390}
391
392static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
393{
394 u32 rate = READ_ONCE(tp->rate_delivered);
395 u32 intv = READ_ONCE(tp->rate_interval_us);
396 u64 rate64 = 0;
397
398 if (rate && intv) {
399 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
400 do_div(rate64, intv);
401 }
402 return rate64;
403}
404

/* Address-family independent initialization for a tcp_sock; called when
 * the socket is created, before any connection is attempted.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	icsk->icsk_rto_min = TCP_RTO_MIN;
	icsk->icsk_delack_max = TCP_DELACK_MAX;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* Initial congestion window (TCP_INIT_CWND segments). */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* Mark delivery-rate samples as application-limited until the
	 * first real samples are available.
	 */
	tp->app_limited = ~0U;

	/* Start in slow start: "infinite" ssthresh and an unclamped cwnd. */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
	WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);

	sk_sockets_allocated_inc(sk);
}
EXPORT_SYMBOL(tcp_init_sock);

static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack = 1;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}
}

static bool tcp_stream_is_readable(struct sock *sk, int target)
{
	if (tcp_epollin_ready(sk, target))
		return true;
	return sk_is_readable(sk);
}
484
485
486
487
488
489
490
491
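/* Wait for a TCP event.  The socket lock is not taken here: the fields
 * read below are inherently racy, which is acceptable for poll()
 * semantics since the caller re-checks after being woken.
 */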
492__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
493{
494 __poll_t mask;
495 struct sock *sk = sock->sk;
496 const struct tcp_sock *tp = tcp_sk(sk);
497 int state;
498
499 sock_poll_wait(file, sock, wait);
500
501 state = inet_sk_state_load(sk);
502 if (state == TCP_LISTEN)
503 return inet_csk_listen_poll(sk);
504
505
506
507
508
509
510 mask = 0;
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539 if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
540 mask |= EPOLLHUP;
541 if (sk->sk_shutdown & RCV_SHUTDOWN)
542 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
543
544
545 if (state != TCP_SYN_SENT &&
546 (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
547 int target = sock_rcvlowat(sk, 0, INT_MAX);
548 u16 urg_data = READ_ONCE(tp->urg_data);
549
550 if (unlikely(urg_data) &&
551 READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
552 !sock_flag(sk, SOCK_URGINLINE))
553 target++;
554
555 if (tcp_stream_is_readable(sk, target))
556 mask |= EPOLLIN | EPOLLRDNORM;
557
558 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
559 if (__sk_stream_is_writeable(sk, 1)) {
560 mask |= EPOLLOUT | EPOLLWRNORM;
561 } else {
562 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
563 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
564
565
566
567
568
569
570 smp_mb__after_atomic();
571 if (__sk_stream_is_writeable(sk, 1))
572 mask |= EPOLLOUT | EPOLLWRNORM;
573 }
574 } else
575 mask |= EPOLLOUT | EPOLLWRNORM;
576
577 if (urg_data & TCP_URG_VALID)
578 mask |= EPOLLPRI;
579 } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
580
581
582
583
584 mask |= EPOLLOUT | EPOLLWRNORM;
585 }
586
587 smp_rmb();
588 if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
589 mask |= EPOLLERR;
590
591 return mask;
592}
593EXPORT_SYMBOL(tcp_poll);
594
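/* ioctl() handler: SIOCINQ reports unread bytes, SIOCATMARK tests
 * whether the read pointer is at the urgent mark, SIOCOUTQ reports
 * queued-but-unacked bytes and SIOCOUTQNSD reports bytes not yet sent.
 */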
595int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
596{
597 struct tcp_sock *tp = tcp_sk(sk);
598 int answ;
599 bool slow;
600
601 switch (cmd) {
602 case SIOCINQ:
603 if (sk->sk_state == TCP_LISTEN)
604 return -EINVAL;
605
606 slow = lock_sock_fast(sk);
607 answ = tcp_inq(sk);
608 unlock_sock_fast(sk, slow);
609 break;
610 case SIOCATMARK:
611 answ = READ_ONCE(tp->urg_data) &&
612 READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
613 break;
614 case SIOCOUTQ:
615 if (sk->sk_state == TCP_LISTEN)
616 return -EINVAL;
617
618 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
619 answ = 0;
620 else
621 answ = READ_ONCE(tp->write_seq) - tp->snd_una;
622 break;
623 case SIOCOUTQNSD:
624 if (sk->sk_state == TCP_LISTEN)
625 return -EINVAL;
626
627 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
628 answ = 0;
629 else
630 answ = READ_ONCE(tp->write_seq) -
631 READ_ONCE(tp->snd_nxt);
632 break;
633 default:
634 return -ENOIOCTLCMD;
635 }
636
637 return put_user(answ, (int __user *)arg);
638}
639EXPORT_SYMBOL(tcp_ioctl);
640
641void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
642{
643 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
644 tp->pushed_seq = tp->write_seq;
645}
646
647static inline bool forced_push(const struct tcp_sock *tp)
648{
649 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
650}
651
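/* Append a freshly allocated skb to the tail of the write queue and
 * charge its truesize to the socket's send-buffer accounting.
 */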
652void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
653{
654 struct tcp_sock *tp = tcp_sk(sk);
655 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
656
657 tcb->seq = tcb->end_seq = tp->write_seq;
658 tcb->tcp_flags = TCPHDR_ACK;
659 __skb_header_release(skb);
660 tcp_add_write_queue_tail(sk, skb);
661 sk_wmem_queued_add(sk, skb->truesize);
662 sk_mem_charge(sk, skb->truesize);
663 if (tp->nonagle & TCP_NAGLE_PUSH)
664 tp->nonagle &= ~TCP_NAGLE_PUSH;
665
666 tcp_slow_start_after_idle_check(sk);
667}
668
669static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
670{
671 if (flags & MSG_OOB)
672 tp->snd_up = tp->write_seq;
673}
674

/* If a not yet filled skb is pushed, do not send it if we have data
 * packets in Qdisc or NIC queues: TX completion will happen shortly and
 * gives a chance to coalesce future sendmsg() payload into this skb,
 * without the need for a timer and with no latency trade off.
 */
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
	       !tcp_rtx_queue_empty(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}

void tcp_push(struct sock *sk, int flags, int mss_now,
	      int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {
		/* avoid the atomic op if TSQ_THROTTLED is already set */
		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		}
		/* It is possible TX completion already happened before
		 * we set TSQ_THROTTLED, so re-test the condition.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}
727
728static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
729 unsigned int offset, size_t len)
730{
731 struct tcp_splice_state *tss = rd_desc->arg.data;
732 int ret;
733
734 ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
735 min(rd_desc->count, len), tss->flags);
736 if (ret > 0)
737 rd_desc->count -= ret;
738 return ret;
739}
740
741static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
742{
743
744 read_descriptor_t rd_desc = {
745 .arg.data = tss,
746 .count = tss->len,
747 };
748
749 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
750}
751
752
753
754
755
756
757
758
759
760
761
762
763
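/* tcp_splice_read - splice data from a TCP socket to a pipe.  Reads
 * pages from the socket and feeds them to the pipe; the socket lock is
 * dropped and re-taken between iterations so incoming packets can be
 * processed.
 */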
764ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
765 struct pipe_inode_info *pipe, size_t len,
766 unsigned int flags)
767{
768 struct sock *sk = sock->sk;
769 struct tcp_splice_state tss = {
770 .pipe = pipe,
771 .len = len,
772 .flags = flags,
773 };
774 long timeo;
775 ssize_t spliced;
776 int ret;
777
778 sock_rps_record_flow(sk);
779
780
781
782 if (unlikely(*ppos))
783 return -ESPIPE;
784
785 ret = spliced = 0;
786
787 lock_sock(sk);
788
789 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
790 while (tss.len) {
791 ret = __tcp_splice_read(sk, &tss);
792 if (ret < 0)
793 break;
794 else if (!ret) {
795 if (spliced)
796 break;
797 if (sock_flag(sk, SOCK_DONE))
798 break;
799 if (sk->sk_err) {
800 ret = sock_error(sk);
801 break;
802 }
803 if (sk->sk_shutdown & RCV_SHUTDOWN)
804 break;
805 if (sk->sk_state == TCP_CLOSE) {
806
807
808
809
810 ret = -ENOTCONN;
811 break;
812 }
813 if (!timeo) {
814 ret = -EAGAIN;
815 break;
816 }
817
818
819
820
821 if (!skb_queue_empty(&sk->sk_receive_queue))
822 break;
823 sk_wait_data(sk, &timeo, NULL);
824 if (signal_pending(current)) {
825 ret = sock_intr_errno(timeo);
826 break;
827 }
828 continue;
829 }
830 tss.len -= ret;
831 spliced += ret;
832
833 if (!timeo)
834 break;
835 release_sock(sk);
836 lock_sock(sk);
837
838 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
839 (sk->sk_shutdown & RCV_SHUTDOWN) ||
840 signal_pending(current))
841 break;
842 }
843
844 release_sock(sk);
845 sk_defer_free_flush(sk);
846
847 if (spliced)
848 return spliced;
849
850 return ret;
851}
852EXPORT_SYMBOL(tcp_splice_read);

struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
				     bool force_schedule)
{
	struct sk_buff *skb;

	if (unlikely(tcp_under_memory_pressure(sk)))
		sk_mem_reclaim_partial(sk);

	skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
	if (likely(skb)) {
		bool mem_scheduled;

		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
		if (force_schedule) {
			mem_scheduled = true;
			sk_forced_mem_schedule(sk, skb->truesize);
		} else {
			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
		}
		if (likely(mem_scheduled)) {
			skb_reserve(skb, MAX_TCP_HEADER);
			skb->ip_summed = CHECKSUM_PARTIAL;
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}
886
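/* Compute the per-skb size goal for transmit: up to sk_gso_max_size
 * bytes, bounded to at most half of the peer's window and expressed as
 * a whole number of MSS-sized segments (at least one MSS).
 */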
887static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
888 int large_allowed)
889{
890 struct tcp_sock *tp = tcp_sk(sk);
891 u32 new_size_goal, size_goal;
892
893 if (!large_allowed)
894 return mss_now;
895
896
897 new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
898 new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
899
900
901 size_goal = tp->gso_segs * mss_now;
902 if (unlikely(new_size_goal < size_goal ||
903 new_size_goal >= size_goal + mss_now)) {
904 tp->gso_segs = min_t(u16, new_size_goal / mss_now,
905 sk->sk_gso_max_segs);
906 size_goal = tp->gso_segs * mss_now;
907 }
908
909 return max(size_goal, mss_now);
910}
911
912int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
913{
914 int mss_now;
915
916 mss_now = tcp_current_mss(sk);
917 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
918
919 return mss_now;
920}

/* In some cases, sendmsg() may have added an skb to the write queue but
 * failed to add any payload to it.  Remove such an empty skb so that a
 * zero-length segment is never transmitted.
 */
void tcp_remove_empty_skb(struct sock *sk)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
		tcp_unlink_write_queue(skb, sk);
		if (tcp_write_queue_empty(sk))
			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
		tcp_wmem_free_skb(sk, skb);
	}
}

/* skb changing from pure zerocopy to mixed: charge the extra truesize. */
static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
{
	if (unlikely(skb_zcopy_pure(skb))) {
		u32 extra = skb->truesize -
			    SKB_TRUESIZE(skb_end_offset(skb));

		if (!sk_wmem_schedule(sk, extra))
			return -ENOMEM;

		sk_mem_charge(sk, extra);
		skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
	}
	return 0;
}
955
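/* Append up to *size bytes of the given page to the tail skb of the
 * write queue (allocating a new skb when needed), either by coalescing
 * with the last fragment or by adding a new page fragment.  On return,
 * *size holds the number of bytes actually added.
 */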
956static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
957 struct page *page, int offset, size_t *size)
958{
959 struct sk_buff *skb = tcp_write_queue_tail(sk);
960 struct tcp_sock *tp = tcp_sk(sk);
961 bool can_coalesce;
962 int copy, i;
963
964 if (!skb || (copy = size_goal - skb->len) <= 0 ||
965 !tcp_skb_can_collapse_to(skb)) {
966new_segment:
967 if (!sk_stream_memory_free(sk))
968 return NULL;
969
970 skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
971 tcp_rtx_and_write_queues_empty(sk));
972 if (!skb)
973 return NULL;
974
975#ifdef CONFIG_TLS_DEVICE
976 skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
977#endif
978 tcp_skb_entail(sk, skb);
979 copy = size_goal;
980 }
981
982 if (copy > *size)
983 copy = *size;
984
985 i = skb_shinfo(skb)->nr_frags;
986 can_coalesce = skb_can_coalesce(skb, i, page, offset);
987 if (!can_coalesce && i >= sysctl_max_skb_frags) {
988 tcp_mark_push(tp, skb);
989 goto new_segment;
990 }
991 if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy))
992 return NULL;
993
994 if (can_coalesce) {
995 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
996 } else {
997 get_page(page);
998 skb_fill_page_desc(skb, i, page, offset, copy);
999 }
1000
1001 if (!(flags & MSG_NO_SHARED_FRAGS))
1002 skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
1003
1004 skb->len += copy;
1005 skb->data_len += copy;
1006 skb->truesize += copy;
1007 sk_wmem_queued_add(sk, copy);
1008 sk_mem_charge(sk, copy);
1009 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1010 TCP_SKB_CB(skb)->end_seq += copy;
1011 tcp_skb_pcount_set(skb, 0);
1012
1013 *size = copy;
1014 return skb;
1015}
1016
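/* sendpage fast path: reference the caller's pages directly as write
 * queue fragments and push them out, waiting for memory or for the
 * connection to be established as required.
 */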
1017ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
1018 size_t size, int flags)
1019{
1020 struct tcp_sock *tp = tcp_sk(sk);
1021 int mss_now, size_goal;
1022 int err;
1023 ssize_t copied;
1024 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1025
1026 if (IS_ENABLED(CONFIG_DEBUG_VM) &&
1027 WARN_ONCE(!sendpage_ok(page),
1028 "page must not be a Slab one and have page_count > 0"))
1029 return -EINVAL;
1030
1031
1032
1033
1034
1035 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1036 !tcp_passive_fastopen(sk)) {
1037 err = sk_stream_wait_connect(sk, &timeo);
1038 if (err != 0)
1039 goto out_err;
1040 }
1041
1042 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1043
1044 mss_now = tcp_send_mss(sk, &size_goal, flags);
1045 copied = 0;
1046
1047 err = -EPIPE;
1048 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1049 goto out_err;
1050
1051 while (size > 0) {
1052 struct sk_buff *skb;
1053 size_t copy = size;
1054
		skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
1056 if (!skb)
1057 goto wait_for_space;
1058
1059 if (!copied)
1060 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1061
1062 copied += copy;
1063 offset += copy;
1064 size -= copy;
1065 if (!size)
1066 goto out;
1067
1068 if (skb->len < size_goal || (flags & MSG_OOB))
1069 continue;
1070
1071 if (forced_push(tp)) {
1072 tcp_mark_push(tp, skb);
1073 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1074 } else if (skb == tcp_send_head(sk))
1075 tcp_push_one(sk, mss_now);
1076 continue;
1077
1078wait_for_space:
1079 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1080 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1081 TCP_NAGLE_PUSH, size_goal);
1082
1083 err = sk_stream_wait_memory(sk, &timeo);
1084 if (err != 0)
1085 goto do_error;
1086
1087 mss_now = tcp_send_mss(sk, &size_goal, flags);
1088 }
1089
1090out:
1091 if (copied) {
1092 tcp_tx_timestamp(sk, sk->sk_tsflags);
1093 if (!(flags & MSG_SENDPAGE_NOTLAST))
1094 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1095 }
1096 return copied;
1097
1098do_error:
1099 tcp_remove_empty_skb(sk);
1100 if (copied)
1101 goto out;
1102out_err:
1103
1104 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1105 sk->sk_write_space(sk);
1106 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1107 }
1108 return sk_stream_error(sk, flags, err);
1109}
1110EXPORT_SYMBOL_GPL(do_tcp_sendpages);
1111
1112int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
1113 size_t size, int flags)
1114{
1115 if (!(sk->sk_route_caps & NETIF_F_SG))
1116 return sock_no_sendpage_locked(sk, page, offset, size, flags);
1117
1118 tcp_rate_check_app_limited(sk);
1119
1120 return do_tcp_sendpages(sk, page, offset, size, flags);
1121}
1122EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
1123
1124int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1125 size_t size, int flags)
1126{
1127 int ret;
1128
1129 lock_sock(sk);
1130 ret = tcp_sendpage_locked(sk, page, offset, size, flags);
1131 release_sock(sk);
1132
1133 return ret;
1134}
1135EXPORT_SYMBOL(tcp_sendpage);
1136
1137void tcp_free_fastopen_req(struct tcp_sock *tp)
1138{
1139 if (tp->fastopen_req) {
1140 kfree(tp->fastopen_req);
1141 tp->fastopen_req = NULL;
1142 }
1143}
1144
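/* Handle a sendmsg() that must carry data on the SYN (MSG_FASTOPEN or a
 * deferred TCP_FASTOPEN_CONNECT connect): record the pending data in
 * fastopen_req and perform the connect, which may return -EINPROGRESS
 * on a non-blocking socket.
 */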
1145static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1146 int *copied, size_t size,
1147 struct ubuf_info *uarg)
1148{
1149 struct tcp_sock *tp = tcp_sk(sk);
1150 struct inet_sock *inet = inet_sk(sk);
1151 struct sockaddr *uaddr = msg->msg_name;
1152 int err, flags;
1153
1154 if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
1155 (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1156 uaddr->sa_family == AF_UNSPEC))
1157 return -EOPNOTSUPP;
1158 if (tp->fastopen_req)
1159 return -EALREADY;
1160
1161 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1162 sk->sk_allocation);
1163 if (unlikely(!tp->fastopen_req))
1164 return -ENOBUFS;
1165 tp->fastopen_req->data = msg;
1166 tp->fastopen_req->size = size;
1167 tp->fastopen_req->uarg = uarg;
1168
1169 if (inet->defer_connect) {
1170 err = tcp_connect(sk);
1171
1172 if (err) {
1173 tcp_set_state(sk, TCP_CLOSE);
1174 inet->inet_dport = 0;
1175 sk->sk_route_caps = 0;
1176 }
1177 }
1178 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1179 err = __inet_stream_connect(sk->sk_socket, uaddr,
1180 msg->msg_namelen, flags, 1);
1181
1182
1183
1184 if (tp->fastopen_req) {
1185 *copied = tp->fastopen_req->copied;
1186 tcp_free_fastopen_req(tp);
1187 inet->defer_connect = 0;
1188 }
1189 return err;
1190}
1191
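/* Main sendmsg() path.  Copies (or zero-copy pins) user data into skbs
 * on the write queue, respecting the current MSS and size goal, and
 * pushes frames out when appropriate.  Caller must hold the socket lock.
 */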
1192int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1193{
1194 struct tcp_sock *tp = tcp_sk(sk);
1195 struct ubuf_info *uarg = NULL;
1196 struct sk_buff *skb;
1197 struct sockcm_cookie sockc;
1198 int flags, err, copied = 0;
1199 int mss_now = 0, size_goal, copied_syn = 0;
1200 int process_backlog = 0;
1201 bool zc = false;
1202 long timeo;
1203
1204 flags = msg->msg_flags;
1205
1206 if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
1207 skb = tcp_write_queue_tail(sk);
1208 uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
1209 if (!uarg) {
1210 err = -ENOBUFS;
1211 goto out_err;
1212 }
1213
1214 zc = sk->sk_route_caps & NETIF_F_SG;
1215 if (!zc)
1216 uarg->zerocopy = 0;
1217 }
1218
1219 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
1220 !tp->repair) {
1221 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
1222 if (err == -EINPROGRESS && copied_syn > 0)
1223 goto out;
1224 else if (err)
1225 goto out_err;
1226 }
1227
1228 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1229
1230 tcp_rate_check_app_limited(sk);
1231
1232
1233
1234
1235
1236 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1237 !tcp_passive_fastopen(sk)) {
1238 err = sk_stream_wait_connect(sk, &timeo);
1239 if (err != 0)
1240 goto do_error;
1241 }
1242
1243 if (unlikely(tp->repair)) {
1244 if (tp->repair_queue == TCP_RECV_QUEUE) {
1245 copied = tcp_send_rcvq(sk, msg, size);
1246 goto out_nopush;
1247 }
1248
1249 err = -EINVAL;
1250 if (tp->repair_queue == TCP_NO_QUEUE)
1251 goto out_err;
1252
1253
1254 }
1255
1256 sockcm_init(&sockc, sk);
1257 if (msg->msg_controllen) {
1258 err = sock_cmsg_send(sk, msg, &sockc);
1259 if (unlikely(err)) {
1260 err = -EINVAL;
1261 goto out_err;
1262 }
1263 }
1264
1265
1266 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1267
1268
1269 copied = 0;
1270
1271restart:
1272 mss_now = tcp_send_mss(sk, &size_goal, flags);
1273
1274 err = -EPIPE;
1275 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1276 goto do_error;
1277
1278 while (msg_data_left(msg)) {
1279 int copy = 0;
1280
1281 skb = tcp_write_queue_tail(sk);
1282 if (skb)
1283 copy = size_goal - skb->len;
1284
1285 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1286 bool first_skb;
1287
1288new_segment:
1289 if (!sk_stream_memory_free(sk))
1290 goto wait_for_space;
1291
1292 if (unlikely(process_backlog >= 16)) {
1293 process_backlog = 0;
1294 if (sk_flush_backlog(sk))
1295 goto restart;
1296 }
1297 first_skb = tcp_rtx_and_write_queues_empty(sk);
1298 skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
1299 first_skb);
1300 if (!skb)
1301 goto wait_for_space;
1302
1303 process_backlog++;
1304
1305 tcp_skb_entail(sk, skb);
1306 copy = size_goal;
1307
1308
1309
1310
1311
1312 if (tp->repair)
1313 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1314 }
1315
1316
1317 if (copy > msg_data_left(msg))
1318 copy = msg_data_left(msg);
1319
1320 if (!zc) {
1321 bool merge = true;
1322 int i = skb_shinfo(skb)->nr_frags;
1323 struct page_frag *pfrag = sk_page_frag(sk);
1324
1325 if (!sk_page_frag_refill(sk, pfrag))
1326 goto wait_for_space;
1327
1328 if (!skb_can_coalesce(skb, i, pfrag->page,
1329 pfrag->offset)) {
1330 if (i >= sysctl_max_skb_frags) {
1331 tcp_mark_push(tp, skb);
1332 goto new_segment;
1333 }
1334 merge = false;
1335 }
1336
1337 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1338
1339 if (tcp_downgrade_zcopy_pure(sk, skb) ||
1340 !sk_wmem_schedule(sk, copy))
1341 goto wait_for_space;
1342
1343 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1344 pfrag->page,
1345 pfrag->offset,
1346 copy);
1347 if (err)
1348 goto do_error;
1349
1350
1351 if (merge) {
1352 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1353 } else {
1354 skb_fill_page_desc(skb, i, pfrag->page,
1355 pfrag->offset, copy);
1356 page_ref_inc(pfrag->page);
1357 }
1358 pfrag->offset += copy;
1359 } else {
1360
1361
1362
1363 if (!skb->len)
1364 skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
1365
1366 if (!skb_zcopy_pure(skb)) {
1367 if (!sk_wmem_schedule(sk, copy))
1368 goto wait_for_space;
1369 }
1370
1371 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
1372 if (err == -EMSGSIZE || err == -EEXIST) {
1373 tcp_mark_push(tp, skb);
1374 goto new_segment;
1375 }
1376 if (err < 0)
1377 goto do_error;
1378 copy = err;
1379 }
1380
1381 if (!copied)
1382 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1383
1384 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1385 TCP_SKB_CB(skb)->end_seq += copy;
1386 tcp_skb_pcount_set(skb, 0);
1387
1388 copied += copy;
1389 if (!msg_data_left(msg)) {
1390 if (unlikely(flags & MSG_EOR))
1391 TCP_SKB_CB(skb)->eor = 1;
1392 goto out;
1393 }
1394
1395 if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
1396 continue;
1397
1398 if (forced_push(tp)) {
1399 tcp_mark_push(tp, skb);
1400 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1401 } else if (skb == tcp_send_head(sk))
1402 tcp_push_one(sk, mss_now);
1403 continue;
1404
1405wait_for_space:
1406 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1407 if (copied)
1408 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1409 TCP_NAGLE_PUSH, size_goal);
1410
1411 err = sk_stream_wait_memory(sk, &timeo);
1412 if (err != 0)
1413 goto do_error;
1414
1415 mss_now = tcp_send_mss(sk, &size_goal, flags);
1416 }
1417
1418out:
1419 if (copied) {
1420 tcp_tx_timestamp(sk, sockc.tsflags);
1421 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1422 }
1423out_nopush:
1424 net_zcopy_put(uarg);
1425 return copied + copied_syn;
1426
1427do_error:
1428 tcp_remove_empty_skb(sk);
1429
1430 if (copied + copied_syn)
1431 goto out;
1432out_err:
1433 net_zcopy_put_abort(uarg, true);
1434 err = sk_stream_error(sk, flags, err);
1435
1436 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1437 sk->sk_write_space(sk);
1438 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1439 }
1440 return err;
1441}
1442EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1443
1444int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1445{
1446 int ret;
1447
1448 lock_sock(sk);
1449 ret = tcp_sendmsg_locked(sk, msg, size);
1450 release_sock(sk);
1451
1452 return ret;
1453}
1454EXPORT_SYMBOL(tcp_sendmsg);
1455
1456
1457
1458
1459
1460
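/* Receive the single byte of out-of-band (urgent) data, unless the
 * socket is in SO_OOBINLINE mode or the urgent byte was already read.
 */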
1461static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1462{
1463 struct tcp_sock *tp = tcp_sk(sk);
1464
1465
1466 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1467 tp->urg_data == TCP_URG_READ)
1468 return -EINVAL;
1469
1470 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1471 return -ENOTCONN;
1472
1473 if (tp->urg_data & TCP_URG_VALID) {
1474 int err = 0;
1475 char c = tp->urg_data;
1476
1477 if (!(flags & MSG_PEEK))
1478 WRITE_ONCE(tp->urg_data, TCP_URG_READ);
1479
1480
1481 msg->msg_flags |= MSG_OOB;
1482
1483 if (len > 0) {
1484 if (!(flags & MSG_TRUNC))
1485 err = memcpy_to_msg(msg, &c, 1);
1486 len = 1;
1487 } else
1488 msg->msg_flags |= MSG_TRUNC;
1489
1490 return err ? -EFAULT : len;
1491 }
1492
1493 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1494 return 0;
1495
1496
1497
1498
1499
1500
1501
1502 return -EAGAIN;
1503}
1504
1505static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1506{
1507 struct sk_buff *skb;
1508 int copied = 0, err = 0;
1509
1510
1511
1512 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1513 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1514 if (err)
1515 return err;
1516 copied += skb->len;
1517 }
1518
1519 skb_queue_walk(&sk->sk_write_queue, skb) {
1520 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1521 if (err)
1522 break;
1523
1524 copied += skb->len;
1525 }
1526
1527 return err ?: copied;
1528}
1529
1530
1531
1532
1533
1534
1535
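/* Clean up after copying data to user space: decide whether the receive
 * window has opened enough, or an ACK is otherwise due, and if so send
 * the ACK right away.  @copied is the number of bytes the caller just
 * consumed.
 */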
1536void tcp_cleanup_rbuf(struct sock *sk, int copied)
1537{
1538 struct tcp_sock *tp = tcp_sk(sk);
1539 bool time_to_ack = false;
1540
1541 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1542
1543 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1544 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1545 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1546
1547 if (inet_csk_ack_scheduled(sk)) {
1548 const struct inet_connection_sock *icsk = inet_csk(sk);
1549
1550 if (
1551 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1552
1553
1554
1555
1556
1557
1558 (copied > 0 &&
1559 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1560 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1561 !inet_csk_in_pingpong_mode(sk))) &&
1562 !atomic_read(&sk->sk_rmem_alloc)))
1563 time_to_ack = true;
1564 }
1565
1566
1567
1568
1569
1570
1571
1572 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1573 __u32 rcv_window_now = tcp_receive_window(tp);
1574
1575
1576 if (2*rcv_window_now <= tp->window_clamp) {
1577 __u32 new_window = __tcp_select_window(sk);
1578
1579
1580
1581
1582
1583
1584 if (new_window && new_window >= 2 * rcv_window_now)
1585 time_to_ack = true;
1586 }
1587 }
1588 if (time_to_ack)
1589 tcp_send_ack(sk);
1590}

void __sk_defer_free_flush(struct sock *sk)
{
	struct llist_node *head;
	struct sk_buff *skb, *n;

	head = llist_del_all(&sk->defer_list);
	llist_for_each_entry_safe(skb, n, head, ll_node) {
		prefetch(n);
		skb_mark_not_on_list(skb);
		__kfree_skb(skb);
	}
}
EXPORT_SYMBOL(__sk_defer_free_flush);

/* Unlink a fully consumed skb from the receive queue.  When possible,
 * defer the actual free to the lockless defer_list, which is flushed
 * later by __sk_defer_free_flush().
 */
static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
{
	__skb_unlink(skb, &sk->sk_receive_queue);
	if (likely(skb->destructor == sock_rfree)) {
		sock_rfree(skb);
		skb->destructor = NULL;
		skb->sk = NULL;
		if (!skb_queue_empty(&sk->sk_receive_queue) ||
		    !llist_empty(&sk->defer_list)) {
			llist_add(&skb->ll_node, &sk->defer_list);
			return;
		}
	}
	__kfree_skb(skb);
}

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			pr_err_once("%s: found a SYN, please report !\n", __func__);
			offset--;
		}
		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
			*off = offset;
			return skb;
		}
		/* This looks weird, but it can happen if TCP collapsing
		 * split up a fat GRO packet while the socket lock was
		 * released, e.g. in skb_splice_bits().
		 */
		tcp_eat_recv_skb(sk, skb);
	}
	return NULL;
}
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
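/* tcp_read_sock - read data from the socket without copying it to user
 * space, handing each contiguous chunk to @recv_actor.  Used by splice
 * and other in-kernel consumers.  Must be called with the socket lock
 * held.
 */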
1657int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1658 sk_read_actor_t recv_actor)
1659{
1660 struct sk_buff *skb;
1661 struct tcp_sock *tp = tcp_sk(sk);
1662 u32 seq = tp->copied_seq;
1663 u32 offset;
1664 int copied = 0;
1665
1666 if (sk->sk_state == TCP_LISTEN)
1667 return -ENOTCONN;
1668 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1669 if (offset < skb->len) {
1670 int used;
1671 size_t len;
1672
1673 len = skb->len - offset;
1674
1675 if (unlikely(tp->urg_data)) {
1676 u32 urg_offset = tp->urg_seq - seq;
1677 if (urg_offset < len)
1678 len = urg_offset;
1679 if (!len)
1680 break;
1681 }
1682 used = recv_actor(desc, skb, offset, len);
1683 if (used <= 0) {
1684 if (!copied)
1685 copied = used;
1686 break;
1687 }
1688 if (WARN_ON_ONCE(used > len))
1689 used = len;
1690 seq += used;
1691 copied += used;
1692 offset += used;
1693
1694
1695
1696
1697
1698
1699 skb = tcp_recv_skb(sk, seq - 1, &offset);
1700 if (!skb)
1701 break;
1702
1703
1704
1705 if (offset + 1 != skb->len)
1706 continue;
1707 }
1708 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1709 tcp_eat_recv_skb(sk, skb);
1710 ++seq;
1711 break;
1712 }
1713 tcp_eat_recv_skb(sk, skb);
1714 if (!desc->count)
1715 break;
1716 WRITE_ONCE(tp->copied_seq, seq);
1717 }
1718 WRITE_ONCE(tp->copied_seq, seq);
1719
1720 tcp_rcv_space_adjust(sk);
1721
1722
1723 if (copied > 0) {
1724 tcp_recv_skb(sk, seq, &offset);
1725 tcp_cleanup_rbuf(sk, copied);
1726 }
1727 return copied;
1728}
1729EXPORT_SYMBOL(tcp_read_sock);
1730
1731int tcp_peek_len(struct socket *sock)
1732{
1733 return tcp_inq(sock->sk);
1734}
1735EXPORT_SYMBOL(tcp_peek_len);
1736
1737
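/* Implement SO_RCVLOWAT for TCP: cap the value to half of the receive
 * buffer limit and, when allowed, grow sk_rcvbuf and the window clamp
 * so the requested low-water mark can actually be buffered.
 */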
1738int tcp_set_rcvlowat(struct sock *sk, int val)
1739{
1740 int cap;
1741
1742 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1743 cap = sk->sk_rcvbuf >> 1;
1744 else
1745 cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
1746 val = min(val, cap);
1747 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1748
1749
1750 tcp_data_ready(sk);
1751
1752 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1753 return 0;
1754
1755 val <<= 1;
1756 if (val > sk->sk_rcvbuf) {
1757 WRITE_ONCE(sk->sk_rcvbuf, val);
1758 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1759 }
1760 return 0;
1761}
1762EXPORT_SYMBOL(tcp_set_rcvlowat);
1763
1764void tcp_update_recv_tstamps(struct sk_buff *skb,
1765 struct scm_timestamping_internal *tss)
1766{
1767 if (skb->tstamp)
1768 tss->ts[0] = ktime_to_timespec64(skb->tstamp);
1769 else
1770 tss->ts[0] = (struct timespec64) {0};
1771
1772 if (skb_hwtstamps(skb)->hwtstamp)
1773 tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
1774 else
1775 tss->ts[2] = (struct timespec64) {0};
1776}
1777
1778#ifdef CONFIG_MMU
1779static const struct vm_operations_struct tcp_vm_ops = {
1780};
1781
1782int tcp_mmap(struct file *file, struct socket *sock,
1783 struct vm_area_struct *vma)
1784{
1785 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1786 return -EPERM;
1787 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1788
1789
1790 vma->vm_flags |= VM_MIXEDMAP;
1791
1792 vma->vm_ops = &tcp_vm_ops;
1793 return 0;
1794}
1795EXPORT_SYMBOL(tcp_mmap);
1796
1797static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
1798 u32 *offset_frag)
1799{
1800 skb_frag_t *frag;
1801
1802 if (unlikely(offset_skb >= skb->len))
1803 return NULL;
1804
1805 offset_skb -= skb_headlen(skb);
1806 if ((int)offset_skb < 0 || skb_has_frag_list(skb))
1807 return NULL;
1808
1809 frag = skb_shinfo(skb)->frags;
1810 while (offset_skb) {
1811 if (skb_frag_size(frag) > offset_skb) {
1812 *offset_frag = offset_skb;
1813 return frag;
1814 }
1815 offset_skb -= skb_frag_size(frag);
1816 ++frag;
1817 }
1818 *offset_frag = 0;
1819 return frag;
1820}
1821
1822static bool can_map_frag(const skb_frag_t *frag)
1823{
1824 return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
1825}
1826
1827static int find_next_mappable_frag(const skb_frag_t *frag,
1828 int remaining_in_skb)
1829{
1830 int offset = 0;
1831
1832 if (likely(can_map_frag(frag)))
1833 return 0;
1834
1835 while (offset < remaining_in_skb && !can_map_frag(frag)) {
1836 offset += skb_frag_size(frag);
1837 ++frag;
1838 }
1839 return offset;
1840}
1841
1842static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
1843 struct tcp_zerocopy_receive *zc,
1844 struct sk_buff *skb, u32 offset)
1845{
1846 u32 frag_offset, partial_frag_remainder = 0;
1847 int mappable_offset;
1848 skb_frag_t *frag;
1849
1850
1851 zc->recv_skip_hint = skb->len - offset;
1852
1853
1854 frag = skb_advance_to_frag(skb, offset, &frag_offset);
1855 if (!frag)
1856 return;
1857
1858 if (frag_offset) {
1859 struct skb_shared_info *info = skb_shinfo(skb);
1860
1861
1862 if (frag == &info->frags[info->nr_frags - 1])
1863 return;
1864
1865
1866 partial_frag_remainder = skb_frag_size(frag) - frag_offset;
1867 zc->recv_skip_hint -= partial_frag_remainder;
1868 ++frag;
1869 }
1870
1871
1872
1873
1874
1875 mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
1876 zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
1877}
1878
1879static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
1880 int nonblock, int flags,
1881 struct scm_timestamping_internal *tss,
1882 int *cmsg_flags);
1883static int receive_fallback_to_copy(struct sock *sk,
1884 struct tcp_zerocopy_receive *zc, int inq,
1885 struct scm_timestamping_internal *tss)
1886{
1887 unsigned long copy_address = (unsigned long)zc->copybuf_address;
1888 struct msghdr msg = {};
1889 struct iovec iov;
1890 int err;
1891
1892 zc->length = 0;
1893 zc->recv_skip_hint = 0;
1894
1895 if (copy_address != zc->copybuf_address)
1896 return -EINVAL;
1897
1898 err = import_single_range(READ, (void __user *)copy_address,
1899 inq, &iov, &msg.msg_iter);
1900 if (err)
1901 return err;
1902
1903 err = tcp_recvmsg_locked(sk, &msg, inq, 1, 0,
1904 tss, &zc->msg_flags);
1905 if (err < 0)
1906 return err;
1907
1908 zc->copybuf_len = err;
1909 if (likely(zc->copybuf_len)) {
1910 struct sk_buff *skb;
1911 u32 offset;
1912
1913 skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
1914 if (skb)
1915 tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
1916 }
1917 return 0;
1918}
1919
1920static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
1921 struct sk_buff *skb, u32 copylen,
1922 u32 *offset, u32 *seq)
1923{
1924 unsigned long copy_address = (unsigned long)zc->copybuf_address;
1925 struct msghdr msg = {};
1926 struct iovec iov;
1927 int err;
1928
1929 if (copy_address != zc->copybuf_address)
1930 return -EINVAL;
1931
1932 err = import_single_range(READ, (void __user *)copy_address,
1933 copylen, &iov, &msg.msg_iter);
1934 if (err)
1935 return err;
1936 err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
1937 if (err)
1938 return err;
1939 zc->recv_skip_hint -= copylen;
1940 *offset += copylen;
1941 *seq += copylen;
1942 return (__s32)copylen;
1943}
1944
1945static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
1946 struct sock *sk,
1947 struct sk_buff *skb,
1948 u32 *seq,
1949 s32 copybuf_len,
1950 struct scm_timestamping_internal *tss)
1951{
1952 u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
1953
1954 if (!copylen)
1955 return 0;
1956
1957 if (skb) {
1958 offset = *seq - TCP_SKB_CB(skb)->seq;
1959 } else {
1960 skb = tcp_recv_skb(sk, *seq, &offset);
1961 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1962 tcp_update_recv_tstamps(skb, tss);
1963 zc->msg_flags |= TCP_CMSG_TS;
1964 }
1965 }
1966
1967 zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
1968 seq);
1969 return zc->copybuf_len < 0 ? 0 : copylen;
1970}
1971
1972static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
1973 struct page **pending_pages,
1974 unsigned long pages_remaining,
1975 unsigned long *address,
1976 u32 *length,
1977 u32 *seq,
1978 struct tcp_zerocopy_receive *zc,
1979 u32 total_bytes_to_map,
1980 int err)
1981{
1982
1983 if (err == -EBUSY &&
1984 zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
1985 u32 maybe_zap_len;
1986
1987 maybe_zap_len = total_bytes_to_map -
1988 *length +
1989 (pages_remaining * PAGE_SIZE);
1990 zap_page_range(vma, *address, maybe_zap_len);
1991 err = 0;
1992 }
1993
1994 if (!err) {
1995 unsigned long leftover_pages = pages_remaining;
1996 int bytes_mapped;
1997
1998
1999 err = vm_insert_pages(vma, *address,
2000 pending_pages,
2001 &pages_remaining);
2002 bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
2003 *seq += bytes_mapped;
2004 *address += bytes_mapped;
2005 }
2006 if (err) {
2007
2008
2009
2010
2011
2012 const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
2013
2014 *length -= bytes_not_mapped;
2015 zc->recv_skip_hint += bytes_not_mapped;
2016 }
2017 return err;
2018}
2019
2020static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
2021 struct page **pages,
2022 unsigned int pages_to_map,
2023 unsigned long *address,
2024 u32 *length,
2025 u32 *seq,
2026 struct tcp_zerocopy_receive *zc,
2027 u32 total_bytes_to_map)
2028{
2029 unsigned long pages_remaining = pages_to_map;
2030 unsigned int pages_mapped;
2031 unsigned int bytes_mapped;
2032 int err;
2033
2034 err = vm_insert_pages(vma, *address, pages, &pages_remaining);
2035 pages_mapped = pages_to_map - (unsigned int)pages_remaining;
2036 bytes_mapped = PAGE_SIZE * pages_mapped;
2037
2038
2039
2040 *seq += bytes_mapped;
2041 *address += bytes_mapped;
2042
2043 if (likely(!err))
2044 return 0;
2045
2046
2047 return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
2048 pages_remaining, address, length, seq, zc, total_bytes_to_map,
2049 err);
2050}
2051
2052#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS)
2053static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
2054 struct tcp_zerocopy_receive *zc,
2055 struct scm_timestamping_internal *tss)
2056{
2057 unsigned long msg_control_addr;
2058 struct msghdr cmsg_dummy;
2059
2060 msg_control_addr = (unsigned long)zc->msg_control;
2061 cmsg_dummy.msg_control = (void *)msg_control_addr;
2062 cmsg_dummy.msg_controllen =
2063 (__kernel_size_t)zc->msg_controllen;
2064 cmsg_dummy.msg_flags = in_compat_syscall()
2065 ? MSG_CMSG_COMPAT : 0;
2066 cmsg_dummy.msg_control_is_user = true;
2067 zc->msg_flags = 0;
2068 if (zc->msg_control == msg_control_addr &&
2069 zc->msg_controllen == cmsg_dummy.msg_controllen) {
2070 tcp_recv_timestamp(&cmsg_dummy, sk, tss);
2071 zc->msg_control = (__u64)
2072 ((uintptr_t)cmsg_dummy.msg_control);
2073 zc->msg_controllen =
2074 (__u64)cmsg_dummy.msg_controllen;
2075 zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
2076 }
2077}
2078
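/* Zero-copy receive: map whole pages of the receive queue into the
 * caller's VMA in batches of TCP_ZEROCOPY_PAGE_BATCH_SIZE, and copy any
 * leftover partial-page data through the user-supplied copy buffer.
 */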
2079#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
2080static int tcp_zerocopy_receive(struct sock *sk,
2081 struct tcp_zerocopy_receive *zc,
2082 struct scm_timestamping_internal *tss)
2083{
2084 u32 length = 0, offset, vma_len, avail_len, copylen = 0;
2085 unsigned long address = (unsigned long)zc->address;
2086 struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
2087 s32 copybuf_len = zc->copybuf_len;
2088 struct tcp_sock *tp = tcp_sk(sk);
2089 const skb_frag_t *frags = NULL;
2090 unsigned int pages_to_map = 0;
2091 struct vm_area_struct *vma;
2092 struct sk_buff *skb = NULL;
2093 u32 seq = tp->copied_seq;
2094 u32 total_bytes_to_map;
2095 int inq = tcp_inq(sk);
2096 int ret;
2097
2098 zc->copybuf_len = 0;
2099 zc->msg_flags = 0;
2100
2101 if (address & (PAGE_SIZE - 1) || address != zc->address)
2102 return -EINVAL;
2103
2104 if (sk->sk_state == TCP_LISTEN)
2105 return -ENOTCONN;
2106
2107 sock_rps_record_flow(sk);
2108
2109 if (inq && inq <= copybuf_len)
2110 return receive_fallback_to_copy(sk, zc, inq, tss);
2111
2112 if (inq < PAGE_SIZE) {
2113 zc->length = 0;
2114 zc->recv_skip_hint = inq;
2115 if (!inq && sock_flag(sk, SOCK_DONE))
2116 return -EIO;
2117 return 0;
2118 }
2119
2120 mmap_read_lock(current->mm);
2121
2122 vma = vma_lookup(current->mm, address);
2123 if (!vma || vma->vm_ops != &tcp_vm_ops) {
2124 mmap_read_unlock(current->mm);
2125 return -EINVAL;
2126 }
2127 vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
2128 avail_len = min_t(u32, vma_len, inq);
2129 total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
2130 if (total_bytes_to_map) {
2131 if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
2132 zap_page_range(vma, address, total_bytes_to_map);
2133 zc->length = total_bytes_to_map;
2134 zc->recv_skip_hint = 0;
2135 } else {
2136 zc->length = avail_len;
2137 zc->recv_skip_hint = avail_len;
2138 }
2139 ret = 0;
2140 while (length + PAGE_SIZE <= zc->length) {
2141 int mappable_offset;
2142 struct page *page;
2143
2144 if (zc->recv_skip_hint < PAGE_SIZE) {
2145 u32 offset_frag;
2146
2147 if (skb) {
2148 if (zc->recv_skip_hint > 0)
2149 break;
2150 skb = skb->next;
2151 offset = seq - TCP_SKB_CB(skb)->seq;
2152 } else {
2153 skb = tcp_recv_skb(sk, seq, &offset);
2154 }
2155
2156 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2157 tcp_update_recv_tstamps(skb, tss);
2158 zc->msg_flags |= TCP_CMSG_TS;
2159 }
2160 zc->recv_skip_hint = skb->len - offset;
2161 frags = skb_advance_to_frag(skb, offset, &offset_frag);
2162 if (!frags || offset_frag)
2163 break;
2164 }
2165
2166 mappable_offset = find_next_mappable_frag(frags,
2167 zc->recv_skip_hint);
2168 if (mappable_offset) {
2169 zc->recv_skip_hint = mappable_offset;
2170 break;
2171 }
2172 page = skb_frag_page(frags);
2173 prefetchw(page);
2174 pages[pages_to_map++] = page;
2175 length += PAGE_SIZE;
2176 zc->recv_skip_hint -= PAGE_SIZE;
2177 frags++;
2178 if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
2179 zc->recv_skip_hint < PAGE_SIZE) {
2180
2181
2182
2183 ret = tcp_zerocopy_vm_insert_batch(vma, pages,
2184 pages_to_map,
2185 &address, &length,
2186 &seq, zc,
2187 total_bytes_to_map);
2188 if (ret)
2189 goto out;
2190 pages_to_map = 0;
2191 }
2192 }
2193 if (pages_to_map) {
2194 ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
2195 &address, &length, &seq,
2196 zc, total_bytes_to_map);
2197 }
2198out:
2199 mmap_read_unlock(current->mm);
2200
2201 if (!ret)
2202 copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
2203
2204 if (length + copylen) {
2205 WRITE_ONCE(tp->copied_seq, seq);
2206 tcp_rcv_space_adjust(sk);
2207
2208
2209 tcp_recv_skb(sk, seq, &offset);
2210 tcp_cleanup_rbuf(sk, length + copylen);
2211 ret = 0;
2212 if (length == zc->length)
2213 zc->recv_skip_hint = 0;
2214 } else {
2215 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
2216 ret = -EIO;
2217 }
2218 zc->length = length;
2219 return ret;
2220}
2221#endif
2222
2223
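/* Similar to __sock_recv_timestamp, but does not require an skb:
 * appends the software and/or hardware receive timestamps collected in
 * @tss to the msghdr as control messages, honouring the socket's
 * old/new timespec format flags.
 */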
2224void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
2225 struct scm_timestamping_internal *tss)
2226{
2227 int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
2228 bool has_timestamping = false;
2229
2230 if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
2231 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
2232 if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
2233 if (new_tstamp) {
2234 struct __kernel_timespec kts = {
2235 .tv_sec = tss->ts[0].tv_sec,
2236 .tv_nsec = tss->ts[0].tv_nsec,
2237 };
2238 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2239 sizeof(kts), &kts);
2240 } else {
2241 struct __kernel_old_timespec ts_old = {
2242 .tv_sec = tss->ts[0].tv_sec,
2243 .tv_nsec = tss->ts[0].tv_nsec,
2244 };
2245 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2246 sizeof(ts_old), &ts_old);
2247 }
2248 } else {
2249 if (new_tstamp) {
2250 struct __kernel_sock_timeval stv = {
2251 .tv_sec = tss->ts[0].tv_sec,
2252 .tv_usec = tss->ts[0].tv_nsec / 1000,
2253 };
2254 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2255 sizeof(stv), &stv);
2256 } else {
2257 struct __kernel_old_timeval tv = {
2258 .tv_sec = tss->ts[0].tv_sec,
2259 .tv_usec = tss->ts[0].tv_nsec / 1000,
2260 };
2261 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2262 sizeof(tv), &tv);
2263 }
2264 }
2265 }
2266
2267 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
2268 has_timestamping = true;
2269 else
2270 tss->ts[0] = (struct timespec64) {0};
2271 }
2272
2273 if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
2274 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
2275 has_timestamping = true;
2276 else
2277 tss->ts[2] = (struct timespec64) {0};
2278 }
2279
2280 if (has_timestamping) {
2281 tss->ts[1] = (struct timespec64) {0};
2282 if (sock_flag(sk, SOCK_TSTAMP_NEW))
2283 put_cmsg_scm_timestamping64(msg, tss);
2284 else
2285 put_cmsg_scm_timestamping(msg, tss);
2286 }
2287}

static int tcp_inq_hint(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 copied_seq = READ_ONCE(tp->copied_seq);
	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
	int inq;

	inq = rcv_nxt - copied_seq;
	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
		lock_sock(sk);
		inq = tp->rcv_nxt - tp->copied_seq;
		release_sock(sk);
	}
	/* After receiving a FIN, report a non-zero value so that the
	 * application issues one more read() and observes EOF.
	 */
	if (inq == 0 && sock_flag(sk, SOCK_DONE))
		inq = 1;
	return inq;
}
2309
2310
2311
2312
2313
2314
2315
2316
2317
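/* This routine copies from a sock struct into the user buffer.  Caller
 * must hold the socket lock.  Handles urgent data, MSG_PEEK, low-water
 * marks and blocking; on success it returns the number of bytes copied
 * and records in *cmsg_flags which control messages the caller should
 * emit.
 */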
2318static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
2319 int nonblock, int flags,
2320 struct scm_timestamping_internal *tss,
2321 int *cmsg_flags)
2322{
2323 struct tcp_sock *tp = tcp_sk(sk);
2324 int copied = 0;
2325 u32 peek_seq;
2326 u32 *seq;
2327 unsigned long used;
2328 int err;
2329 int target;
2330 long timeo;
2331 struct sk_buff *skb, *last;
2332 u32 urg_hole = 0;
2333
2334 err = -ENOTCONN;
2335 if (sk->sk_state == TCP_LISTEN)
2336 goto out;
2337
2338 if (tp->recvmsg_inq)
2339 *cmsg_flags = TCP_CMSG_INQ;
2340 timeo = sock_rcvtimeo(sk, nonblock);
2341
2342
2343 if (flags & MSG_OOB)
2344 goto recv_urg;
2345
2346 if (unlikely(tp->repair)) {
2347 err = -EPERM;
2348 if (!(flags & MSG_PEEK))
2349 goto out;
2350
2351 if (tp->repair_queue == TCP_SEND_QUEUE)
2352 goto recv_sndq;
2353
2354 err = -EINVAL;
2355 if (tp->repair_queue == TCP_NO_QUEUE)
2356 goto out;
2357
2358
2359 }
2360
2361 seq = &tp->copied_seq;
2362 if (flags & MSG_PEEK) {
2363 peek_seq = tp->copied_seq;
2364 seq = &peek_seq;
2365 }
2366
2367 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
2368
2369 do {
2370 u32 offset;
2371
2372
2373 if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
2374 if (copied)
2375 break;
2376 if (signal_pending(current)) {
2377 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2378 break;
2379 }
2380 }
2381
2382
2383
2384 last = skb_peek_tail(&sk->sk_receive_queue);
2385 skb_queue_walk(&sk->sk_receive_queue, skb) {
2386 last = skb;
2387
2388
2389
2390 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2391 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2392 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2393 flags))
2394 break;
2395
2396 offset = *seq - TCP_SKB_CB(skb)->seq;
2397 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2398 pr_err_once("%s: found a SYN, please report !\n", __func__);
2399 offset--;
2400 }
2401 if (offset < skb->len)
2402 goto found_ok_skb;
2403 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2404 goto found_fin_ok;
2405 WARN(!(flags & MSG_PEEK),
2406 "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2407 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2408 }
2409
2410
2411
2412 if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
2413 break;
2414
2415 if (copied) {
2416 if (!timeo ||
2417 sk->sk_err ||
2418 sk->sk_state == TCP_CLOSE ||
2419 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2420 signal_pending(current))
2421 break;
2422 } else {
2423 if (sock_flag(sk, SOCK_DONE))
2424 break;
2425
2426 if (sk->sk_err) {
2427 copied = sock_error(sk);
2428 break;
2429 }
2430
2431 if (sk->sk_shutdown & RCV_SHUTDOWN)
2432 break;
2433
2434 if (sk->sk_state == TCP_CLOSE) {
2435
2436
2437
2438 copied = -ENOTCONN;
2439 break;
2440 }
2441
2442 if (!timeo) {
2443 copied = -EAGAIN;
2444 break;
2445 }
2446
2447 if (signal_pending(current)) {
2448 copied = sock_intr_errno(timeo);
2449 break;
2450 }
2451 }
2452
2453 if (copied >= target) {
2454
2455 __sk_flush_backlog(sk);
2456 } else {
2457 tcp_cleanup_rbuf(sk, copied);
2458 sk_defer_free_flush(sk);
2459 sk_wait_data(sk, &timeo, last);
2460 }
2461
2462 if ((flags & MSG_PEEK) &&
2463 (peek_seq - copied - urg_hole != tp->copied_seq)) {
2464 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2465 current->comm,
2466 task_pid_nr(current));
2467 peek_seq = tp->copied_seq;
2468 }
2469 continue;
2470
2471found_ok_skb:
2472
2473 used = skb->len - offset;
2474 if (len < used)
2475 used = len;
2476
2477
2478 if (unlikely(tp->urg_data)) {
2479 u32 urg_offset = tp->urg_seq - *seq;
2480 if (urg_offset < used) {
2481 if (!urg_offset) {
2482 if (!sock_flag(sk, SOCK_URGINLINE)) {
2483 WRITE_ONCE(*seq, *seq + 1);
2484 urg_hole++;
2485 offset++;
2486 used--;
2487 if (!used)
2488 goto skip_copy;
2489 }
2490 } else
2491 used = urg_offset;
2492 }
2493 }
2494
2495 if (!(flags & MSG_TRUNC)) {
2496 err = skb_copy_datagram_msg(skb, offset, msg, used);
2497 if (err) {
2498
2499 if (!copied)
2500 copied = -EFAULT;
2501 break;
2502 }
2503 }
2504
2505 WRITE_ONCE(*seq, *seq + used);
2506 copied += used;
2507 len -= used;
2508
2509 tcp_rcv_space_adjust(sk);
2510
2511skip_copy:
2512 if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
2513 WRITE_ONCE(tp->urg_data, 0);
2514 tcp_fast_path_check(sk);
2515 }
2516
2517 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2518 tcp_update_recv_tstamps(skb, tss);
2519 *cmsg_flags |= TCP_CMSG_TS;
2520 }
2521
2522 if (used + offset < skb->len)
2523 continue;
2524
2525 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2526 goto found_fin_ok;
2527 if (!(flags & MSG_PEEK))
2528 tcp_eat_recv_skb(sk, skb);
2529 continue;
2530
2531found_fin_ok:
2532
2533 WRITE_ONCE(*seq, *seq + 1);
2534 if (!(flags & MSG_PEEK))
2535 tcp_eat_recv_skb(sk, skb);
2536 break;
2537 } while (len > 0);
2538
2539
2540
2541
2542
2543
2544 tcp_cleanup_rbuf(sk, copied);
2545 return copied;
2546
2547out:
2548 return err;
2549
2550recv_urg:
2551 err = tcp_recv_urg(sk, msg, len, flags);
2552 goto out;
2553
2554recv_sndq:
2555 err = tcp_peek_sndq(sk, msg, len);
2556 goto out;
2557}
2558
2559int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
2560 int flags, int *addr_len)
2561{
2562 int cmsg_flags = 0, ret, inq;
2563 struct scm_timestamping_internal tss;
2564
2565 if (unlikely(flags & MSG_ERRQUEUE))
2566 return inet_recv_error(sk, msg, len, addr_len);
2567
2568 if (sk_can_busy_loop(sk) &&
2569 skb_queue_empty_lockless(&sk->sk_receive_queue) &&
2570 sk->sk_state == TCP_ESTABLISHED)
2571 sk_busy_loop(sk, nonblock);
2572
2573 lock_sock(sk);
2574 ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
2575 &cmsg_flags);
2576 release_sock(sk);
2577 sk_defer_free_flush(sk);
2578
2579 if (cmsg_flags && ret >= 0) {
2580 if (cmsg_flags & TCP_CMSG_TS)
2581 tcp_recv_timestamp(msg, sk, &tss);
2582 if (cmsg_flags & TCP_CMSG_INQ) {
2583 inq = tcp_inq_hint(sk);
2584 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2585 }
2586 }
2587 return ret;
2588}
2589EXPORT_SYMBOL(tcp_recvmsg);
2590
2591void tcp_set_state(struct sock *sk, int state)
2592{
2593 int oldstate = sk->sk_state;
2594
2595
2596
2597
2598
2599
2600
2601
2602 BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2603 BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2604 BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2605 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2606 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2607 BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2608 BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2609 BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2610 BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2611 BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2612 BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2613 BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2614 BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625 BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
2626
2627 if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2628 tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2629
2630 switch (state) {
2631 case TCP_ESTABLISHED:
2632 if (oldstate != TCP_ESTABLISHED)
2633 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2634 break;
2635
2636 case TCP_CLOSE:
2637 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2638 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2639
2640 sk->sk_prot->unhash(sk);
2641 if (inet_csk(sk)->icsk_bind_hash &&
2642 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2643 inet_put_port(sk);
2644 fallthrough;
2645 default:
2646 if (oldstate == TCP_ESTABLISHED)
2647 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2648 }
2649
2650
2651
2652
2653 inet_sk_state_store(sk, state);
2654}
2655EXPORT_SYMBOL_GPL(tcp_set_state);
2656
2657
2658
2659
2660
2661
2662
2663
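/* State transition table used by close()/shutdown(): indexed by the current
 * state, each entry holds the next state, optionally OR'ed with
 * TCP_ACTION_FIN when a FIN must be sent as part of the transition.
 * Entry 0 corresponds to an invalid/unused state.
 */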
2664static const unsigned char new_state[16] = {
2665
2666	[0 /* (Invalid) */ ] = TCP_CLOSE,
2667 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2668 [TCP_SYN_SENT] = TCP_CLOSE,
2669 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2670 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2671 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2672 [TCP_TIME_WAIT] = TCP_CLOSE,
2673 [TCP_CLOSE] = TCP_CLOSE,
2674 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2675 [TCP_LAST_ACK] = TCP_LAST_ACK,
2676 [TCP_LISTEN] = TCP_CLOSE,
2677 [TCP_CLOSING] = TCP_CLOSING,
2678 [TCP_NEW_SYN_RECV] = TCP_CLOSE,
2679};
2680
2681static int tcp_close_state(struct sock *sk)
2682{
2683 int next = (int)new_state[sk->sk_state];
2684 int ns = next & TCP_STATE_MASK;
2685
2686 tcp_set_state(sk, ns);
2687
2688 return next & TCP_ACTION_FIN;
2689}
2690
2691
2692
2693
2694
2695
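/* Shut down the sending side of a connection.  Much like close() except that
 * we do not mark the socket dead, so it can still receive data.
 */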
2696void tcp_shutdown(struct sock *sk, int how)
2697{
2698
2699
2700
2701
2702 if (!(how & SEND_SHUTDOWN))
2703 return;
2704
2705
2706 if ((1 << sk->sk_state) &
2707 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2708 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2709
2710 if (tcp_close_state(sk))
2711 tcp_send_fin(sk);
2712 }
2713}
2714EXPORT_SYMBOL(tcp_shutdown);
2715
2716int tcp_orphan_count_sum(void)
2717{
2718 int i, total = 0;
2719
2720 for_each_possible_cpu(i)
2721 total += per_cpu(tcp_orphan_count, i);
2722
2723 return max(total, 0);
2724}
2725
2726static int tcp_orphan_cache;
2727static struct timer_list tcp_orphan_timer;
2728#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
2729
2730static void tcp_orphan_update(struct timer_list *unused)
2731{
2732 WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
2733 mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
2734}
2735
2736static bool tcp_too_many_orphans(int shift)
2737{
2738 return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
2739}
2740
2741bool tcp_check_oom(struct sock *sk, int shift)
2742{
2743 bool too_many_orphans, out_of_socket_memory;
2744
2745 too_many_orphans = tcp_too_many_orphans(shift);
2746 out_of_socket_memory = tcp_out_of_memory(sk);
2747
2748 if (too_many_orphans)
2749 net_info_ratelimited("too many orphaned sockets\n");
2750 if (out_of_socket_memory)
2751 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2752 return too_many_orphans || out_of_socket_memory;
2753}
2754
2755void __tcp_close(struct sock *sk, long timeout)
2756{
2757 struct sk_buff *skb;
2758 int data_was_unread = 0;
2759 int state;
2760
2761 sk->sk_shutdown = SHUTDOWN_MASK;
2762
2763 if (sk->sk_state == TCP_LISTEN) {
2764 tcp_set_state(sk, TCP_CLOSE);
2765
2766
2767 inet_csk_listen_stop(sk);
2768
2769 goto adjudge_to_death;
2770 }
2771
2772
2773
2774
2775
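	/* Throw away any data left in the receive queue, counting how much
	 * was unread.  A FIN consumes a sequence number but carries no data.
	 */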
2776 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2777 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2778
2779 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2780 len--;
2781 data_was_unread += len;
2782 __kfree_skb(skb);
2783 }
2784
2785 sk_mem_reclaim(sk);
2786
2787
2788 if (sk->sk_state == TCP_CLOSE)
2789 goto adjudge_to_death;
2790
2791
2792
2793
2794
2795
2796
2797
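	/* A socket under repair is torn down quietly.  Otherwise, discarding
	 * unread data on close() warrants an active reset (RFC 2525, 2.17)
	 * rather than a normal FIN handshake.
	 */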
2798 if (unlikely(tcp_sk(sk)->repair)) {
2799 sk->sk_prot->disconnect(sk, 0);
2800 } else if (data_was_unread) {
2801
2802 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2803 tcp_set_state(sk, TCP_CLOSE);
2804 tcp_send_active_reset(sk, sk->sk_allocation);
2805 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2806
2807 sk->sk_prot->disconnect(sk, 0);
2808 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2809 } else if (tcp_close_state(sk)) {
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
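		/* Normal close: tcp_close_state() has moved us to a
		 * FIN-sending state, so queue the FIN now.
		 */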
2839 tcp_send_fin(sk);
2840 }
2841
2842 sk_stream_wait_close(sk, timeout);
2843
2844adjudge_to_death:
2845 state = sk->sk_state;
2846 sock_hold(sk);
2847 sock_orphan(sk);
2848
2849 local_bh_disable();
2850 bh_lock_sock(sk);
2851
2852 __release_sock(sk);
2853
2854 this_cpu_inc(tcp_orphan_count);
2855
2856
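	/* Processing the backlog above may already have moved the socket to
	 * TCP_CLOSE (e.g. on an incoming reset); nothing more to do then.
	 */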
2857 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2858 goto out;
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
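	/* In FIN_WAIT2 the socket must not be allowed to linger forever:
	 * depending on linger2 we either reset it, hand it off to a timewait
	 * socket, or arm the keepalive timer for the remaining FIN timeout.
	 */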
2874 if (sk->sk_state == TCP_FIN_WAIT2) {
2875 struct tcp_sock *tp = tcp_sk(sk);
2876 if (tp->linger2 < 0) {
2877 tcp_set_state(sk, TCP_CLOSE);
2878 tcp_send_active_reset(sk, GFP_ATOMIC);
2879 __NET_INC_STATS(sock_net(sk),
2880 LINUX_MIB_TCPABORTONLINGER);
2881 } else {
2882 const int tmo = tcp_fin_time(sk);
2883
2884 if (tmo > TCP_TIMEWAIT_LEN) {
2885 inet_csk_reset_keepalive_timer(sk,
2886 tmo - TCP_TIMEWAIT_LEN);
2887 } else {
2888 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2889 goto out;
2890 }
2891 }
2892 }
2893 if (sk->sk_state != TCP_CLOSE) {
2894 sk_mem_reclaim(sk);
2895 if (tcp_check_oom(sk, 0)) {
2896 tcp_set_state(sk, TCP_CLOSE);
2897 tcp_send_active_reset(sk, GFP_ATOMIC);
2898 __NET_INC_STATS(sock_net(sk),
2899 LINUX_MIB_TCPABORTONMEMORY);
2900 } else if (!check_net(sock_net(sk))) {
2901
2902 tcp_set_state(sk, TCP_CLOSE);
2903 }
2904 }
2905
2906 if (sk->sk_state == TCP_CLOSE) {
2907 struct request_sock *req;
2908
2909 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2910 lockdep_sock_is_held(sk));
2911
2912
2913
2914
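		/* A TCP Fast Open passive connection may still hold a
		 * request_sock here; drop it before destroying the socket.
		 */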
2915 if (req)
2916 reqsk_fastopen_remove(sk, req, false);
2917 inet_csk_destroy_sock(sk);
2918 }
2919
2920
2921out:
2922 bh_unlock_sock(sk);
2923 local_bh_enable();
2924}
2925
2926void tcp_close(struct sock *sk, long timeout)
2927{
2928 lock_sock(sk);
2929 __tcp_close(sk, timeout);
2930 release_sock(sk);
2931 sock_put(sk);
2932}
2933EXPORT_SYMBOL(tcp_close);
2934
2935
2936
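/* States that require sending a RST when the connection is aborted (RFC 793). */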
2937static inline bool tcp_need_reset(int state)
2938{
2939 return (1 << state) &
2940 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2941 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2942}
2943
2944static void tcp_rtx_queue_purge(struct sock *sk)
2945{
2946 struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2947
2948 tcp_sk(sk)->highest_sack = NULL;
2949 while (p) {
2950 struct sk_buff *skb = rb_to_skb(p);
2951
2952 p = rb_next(p);
2953
2954
2955
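		/* The whole queue is going away, so there is no need to
		 * detach each skb from the tsorted list first.
		 */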
2956 tcp_rtx_queue_unlink(skb, sk);
2957 tcp_wmem_free_skb(sk, skb);
2958 }
2959}
2960
2961void tcp_write_queue_purge(struct sock *sk)
2962{
2963 struct sk_buff *skb;
2964
2965 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
2966 while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
2967 tcp_skb_tsorted_anchor_cleanup(skb);
2968 tcp_wmem_free_skb(sk, skb);
2969 }
2970 tcp_rtx_queue_purge(sk);
2971 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2972 sk_mem_reclaim(sk);
2973 tcp_clear_all_retrans_hints(tcp_sk(sk));
2974 tcp_sk(sk)->packets_out = 0;
2975 inet_csk(sk)->icsk_backoff = 0;
2976}
2977
2978int tcp_disconnect(struct sock *sk, int flags)
2979{
2980 struct inet_sock *inet = inet_sk(sk);
2981 struct inet_connection_sock *icsk = inet_csk(sk);
2982 struct tcp_sock *tp = tcp_sk(sk);
2983 int old_state = sk->sk_state;
2984 u32 seq;
2985
2986 if (old_state != TCP_CLOSE)
2987 tcp_set_state(sk, TCP_CLOSE);
2988
2989
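	/* This is essentially the ABORT function of RFC 793. */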
2990 if (old_state == TCP_LISTEN) {
2991 inet_csk_listen_stop(sk);
2992 } else if (unlikely(tp->repair)) {
2993 sk->sk_err = ECONNABORTED;
2994 } else if (tcp_need_reset(old_state) ||
2995 (tp->snd_nxt != tp->write_seq &&
2996 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2997
2998
2999
3000 tcp_send_active_reset(sk, gfp_any());
3001 sk->sk_err = ECONNRESET;
3002 } else if (old_state == TCP_SYN_SENT)
3003 sk->sk_err = ECONNRESET;
3004
3005 tcp_clear_xmit_timers(sk);
3006 __skb_queue_purge(&sk->sk_receive_queue);
3007 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
3008 WRITE_ONCE(tp->urg_data, 0);
3009 tcp_write_queue_purge(sk);
3010 tcp_fastopen_active_disable_ofo_check(sk);
3011 skb_rbtree_purge(&tp->out_of_order_queue);
3012
3013 inet->inet_dport = 0;
3014
3015 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
3016 inet_reset_saddr(sk);
3017
3018 sk->sk_shutdown = 0;
3019 sock_reset_flag(sk, SOCK_DONE);
3020 tp->srtt_us = 0;
3021 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
3022 tp->rcv_rtt_last_tsecr = 0;
3023
3024 seq = tp->write_seq + tp->max_window + 2;
3025 if (!seq)
3026 seq = 1;
3027 WRITE_ONCE(tp->write_seq, seq);
3028
3029 icsk->icsk_backoff = 0;
3030 icsk->icsk_probes_out = 0;
3031 icsk->icsk_probes_tstamp = 0;
3032 icsk->icsk_rto = TCP_TIMEOUT_INIT;
3033 icsk->icsk_rto_min = TCP_RTO_MIN;
3034 icsk->icsk_delack_max = TCP_DELACK_MAX;
3035 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
3036 tp->snd_cwnd = TCP_INIT_CWND;
3037 tp->snd_cwnd_cnt = 0;
3038 tp->window_clamp = 0;
3039 tp->delivered = 0;
3040 tp->delivered_ce = 0;
3041 if (icsk->icsk_ca_ops->release)
3042 icsk->icsk_ca_ops->release(sk);
3043 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
3044 icsk->icsk_ca_initialized = 0;
3045 tcp_set_ca_state(sk, TCP_CA_Open);
3046 tp->is_sack_reneg = 0;
3047 tcp_clear_retrans(tp);
3048 tp->total_retrans = 0;
3049 inet_csk_delack_init(sk);
3050
3051
3052
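	/* Initialize rcv_mss to TCP_MIN_MSS to avoid a division by zero
	 * in __tcp_select_window().
	 */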
3053 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
3054 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
3055 __sk_dst_reset(sk);
3056 dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
3057 tcp_saved_syn_free(tp);
3058 tp->compressed_ack = 0;
3059 tp->segs_in = 0;
3060 tp->segs_out = 0;
3061 tp->bytes_sent = 0;
3062 tp->bytes_acked = 0;
3063 tp->bytes_received = 0;
3064 tp->bytes_retrans = 0;
3065 tp->data_segs_in = 0;
3066 tp->data_segs_out = 0;
3067 tp->duplicate_sack[0].start_seq = 0;
3068 tp->duplicate_sack[0].end_seq = 0;
3069 tp->dsack_dups = 0;
3070 tp->reord_seen = 0;
3071 tp->retrans_out = 0;
3072 tp->sacked_out = 0;
3073 tp->tlp_high_seq = 0;
3074 tp->last_oow_ack_time = 0;
3075
3076 tp->app_limited = ~0U;
3077 tp->rack.mstamp = 0;
3078 tp->rack.advanced = 0;
3079 tp->rack.reo_wnd_steps = 1;
3080 tp->rack.last_delivered = 0;
3081 tp->rack.reo_wnd_persist = 0;
3082 tp->rack.dsack_seen = 0;
3083 tp->syn_data_acked = 0;
3084 tp->rx_opt.saw_tstamp = 0;
3085 tp->rx_opt.dsack = 0;
3086 tp->rx_opt.num_sacks = 0;
3087 tp->rcv_ooopack = 0;
3088
3089
3090
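	/* Clean up Fast Open related fields. */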
3091 tcp_free_fastopen_req(tp);
3092 inet->defer_connect = 0;
3093 tp->fastopen_client_fail = 0;
3094
3095 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
3096
3097 if (sk->sk_frag.page) {
3098 put_page(sk->sk_frag.page);
3099 sk->sk_frag.page = NULL;
3100 sk->sk_frag.offset = 0;
3101 }
3102 sk_defer_free_flush(sk);
3103 sk_error_report(sk);
3104 return 0;
3105}
3106EXPORT_SYMBOL(tcp_disconnect);
3107
3108static inline bool tcp_can_repair_sock(const struct sock *sk)
3109{
3110 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
3111 (sk->sk_state != TCP_LISTEN);
3112}
3113
3114static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
3115{
3116 struct tcp_repair_window opt;
3117
3118 if (!tp->repair)
3119 return -EPERM;
3120
3121 if (len != sizeof(opt))
3122 return -EINVAL;
3123
3124 if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
3125 return -EFAULT;
3126
3127 if (opt.max_window < opt.snd_wnd)
3128 return -EINVAL;
3129
3130 if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
3131 return -EINVAL;
3132
3133 if (after(opt.rcv_wup, tp->rcv_nxt))
3134 return -EINVAL;
3135
3136 tp->snd_wl1 = opt.snd_wl1;
3137 tp->snd_wnd = opt.snd_wnd;
3138 tp->max_window = opt.max_window;
3139
3140 tp->rcv_wnd = opt.rcv_wnd;
3141 tp->rcv_wup = opt.rcv_wup;
3142
3143 return 0;
3144}
3145
3146static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
3147 unsigned int len)
3148{
3149 struct tcp_sock *tp = tcp_sk(sk);
3150 struct tcp_repair_opt opt;
3151 size_t offset = 0;
3152
3153 while (len >= sizeof(opt)) {
3154 if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
3155 return -EFAULT;
3156
3157 offset += sizeof(opt);
3158 len -= sizeof(opt);
3159
3160 switch (opt.opt_code) {
3161 case TCPOPT_MSS:
3162 tp->rx_opt.mss_clamp = opt.opt_val;
3163 tcp_mtup_init(sk);
3164 break;
3165 case TCPOPT_WINDOW:
3166 {
3167 u16 snd_wscale = opt.opt_val & 0xFFFF;
3168 u16 rcv_wscale = opt.opt_val >> 16;
3169
3170 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
3171 return -EFBIG;
3172
3173 tp->rx_opt.snd_wscale = snd_wscale;
3174 tp->rx_opt.rcv_wscale = rcv_wscale;
3175 tp->rx_opt.wscale_ok = 1;
3176 }
3177 break;
3178 case TCPOPT_SACK_PERM:
3179 if (opt.opt_val != 0)
3180 return -EINVAL;
3181
3182 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
3183 break;
3184 case TCPOPT_TIMESTAMP:
3185 if (opt.opt_val != 0)
3186 return -EINVAL;
3187
3188 tp->rx_opt.tstamp_ok = 1;
3189 break;
3190 }
3191 }
3192
3193 return 0;
3194}
3195
3196DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
3197EXPORT_SYMBOL(tcp_tx_delay_enabled);
3198
3199static void tcp_enable_tx_delay(void)
3200{
3201 if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
3202 static int __tcp_tx_delay_enabled = 0;
3203
3204 if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
3205 static_branch_enable(&tcp_tx_delay_enabled);
3206 pr_info("TCP_TX_DELAY enabled\n");
3207 }
3208 }
3209}
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
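/* TCP_CORK: queue partial frames until the option is cleared, at which point
 * any pending frames are pushed out.  It can be combined with TCP_NODELAY and
 * is the stronger of the two.
 */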
3220void __tcp_sock_set_cork(struct sock *sk, bool on)
3221{
3222 struct tcp_sock *tp = tcp_sk(sk);
3223
3224 if (on) {
3225 tp->nonagle |= TCP_NAGLE_CORK;
3226 } else {
3227 tp->nonagle &= ~TCP_NAGLE_CORK;
3228 if (tp->nonagle & TCP_NAGLE_OFF)
3229 tp->nonagle |= TCP_NAGLE_PUSH;
3230 tcp_push_pending_frames(sk);
3231 }
3232}
3233
3234void tcp_sock_set_cork(struct sock *sk, bool on)
3235{
3236 lock_sock(sk);
3237 __tcp_sock_set_cork(sk, on);
3238 release_sock(sk);
3239}
3240EXPORT_SYMBOL(tcp_sock_set_cork);
3241
3242
3243
3244
3245
3246
3247
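/* TCP_NODELAY is weaker than TCP_CORK: on a corked socket it is remembered but
 * not activated until the cork is removed.  Setting it does, however, force an
 * explicit push of whatever is already queued.
 */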
3248void __tcp_sock_set_nodelay(struct sock *sk, bool on)
3249{
3250 if (on) {
3251 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
3252 tcp_push_pending_frames(sk);
3253 } else {
3254 tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
3255 }
3256}
3257
3258void tcp_sock_set_nodelay(struct sock *sk)
3259{
3260 lock_sock(sk);
3261 __tcp_sock_set_nodelay(sk, true);
3262 release_sock(sk);
3263}
3264EXPORT_SYMBOL(tcp_sock_set_nodelay);
3265
3266static void __tcp_sock_set_quickack(struct sock *sk, int val)
3267{
3268 if (!val) {
3269 inet_csk_enter_pingpong_mode(sk);
3270 return;
3271 }
3272
3273 inet_csk_exit_pingpong_mode(sk);
3274 if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3275 inet_csk_ack_scheduled(sk)) {
3276 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
3277 tcp_cleanup_rbuf(sk, 1);
3278 if (!(val & 1))
3279 inet_csk_enter_pingpong_mode(sk);
3280 }
3281}
3282
3283void tcp_sock_set_quickack(struct sock *sk, int val)
3284{
3285 lock_sock(sk);
3286 __tcp_sock_set_quickack(sk, val);
3287 release_sock(sk);
3288}
3289EXPORT_SYMBOL(tcp_sock_set_quickack);
3290
3291int tcp_sock_set_syncnt(struct sock *sk, int val)
3292{
3293 if (val < 1 || val > MAX_TCP_SYNCNT)
3294 return -EINVAL;
3295
3296 lock_sock(sk);
3297 inet_csk(sk)->icsk_syn_retries = val;
3298 release_sock(sk);
3299 return 0;
3300}
3301EXPORT_SYMBOL(tcp_sock_set_syncnt);
3302
3303void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
3304{
3305 lock_sock(sk);
3306 inet_csk(sk)->icsk_user_timeout = val;
3307 release_sock(sk);
3308}
3309EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3310
3311int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
3312{
3313 struct tcp_sock *tp = tcp_sk(sk);
3314
3315 if (val < 1 || val > MAX_TCP_KEEPIDLE)
3316 return -EINVAL;
3317
3318 tp->keepalive_time = val * HZ;
3319 if (sock_flag(sk, SOCK_KEEPOPEN) &&
3320 !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
3321 u32 elapsed = keepalive_time_elapsed(tp);
3322
3323 if (tp->keepalive_time > elapsed)
3324 elapsed = tp->keepalive_time - elapsed;
3325 else
3326 elapsed = 0;
3327 inet_csk_reset_keepalive_timer(sk, elapsed);
3328 }
3329
3330 return 0;
3331}
3332
3333int tcp_sock_set_keepidle(struct sock *sk, int val)
3334{
3335 int err;
3336
3337 lock_sock(sk);
3338 err = tcp_sock_set_keepidle_locked(sk, val);
3339 release_sock(sk);
3340 return err;
3341}
3342EXPORT_SYMBOL(tcp_sock_set_keepidle);
3343
3344int tcp_sock_set_keepintvl(struct sock *sk, int val)
3345{
3346 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3347 return -EINVAL;
3348
3349 lock_sock(sk);
3350 tcp_sk(sk)->keepalive_intvl = val * HZ;
3351 release_sock(sk);
3352 return 0;
3353}
3354EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3355
3356int tcp_sock_set_keepcnt(struct sock *sk, int val)
3357{
3358 if (val < 1 || val > MAX_TCP_KEEPCNT)
3359 return -EINVAL;
3360
3361 lock_sock(sk);
3362 tcp_sk(sk)->keepalive_probes = val;
3363 release_sock(sk);
3364 return 0;
3365}
3366EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3367
3368int tcp_set_window_clamp(struct sock *sk, int val)
3369{
3370 struct tcp_sock *tp = tcp_sk(sk);
3371
3372 if (!val) {
3373 if (sk->sk_state != TCP_CLOSE)
3374 return -EINVAL;
3375 tp->window_clamp = 0;
3376 } else {
3377 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3378 SOCK_MIN_RCVBUF / 2 : val;
3379 tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
3380 }
3381 return 0;
3382}
3383
3384
3385
3386
3387static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3388 sockptr_t optval, unsigned int optlen)
3389{
3390 struct tcp_sock *tp = tcp_sk(sk);
3391 struct inet_connection_sock *icsk = inet_csk(sk);
3392 struct net *net = sock_net(sk);
3393 int val;
3394 int err = 0;
3395
3396
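	/* These options take string or structure values; all the others are
	 * plain integers and are handled by the second switch further down.
	 */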
3397 switch (optname) {
3398 case TCP_CONGESTION: {
3399 char name[TCP_CA_NAME_MAX];
3400
3401 if (optlen < 1)
3402 return -EINVAL;
3403
3404 val = strncpy_from_sockptr(name, optval,
3405 min_t(long, TCP_CA_NAME_MAX-1, optlen));
3406 if (val < 0)
3407 return -EFAULT;
3408 name[val] = 0;
3409
3410 lock_sock(sk);
3411 err = tcp_set_congestion_control(sk, name, true,
3412 ns_capable(sock_net(sk)->user_ns,
3413 CAP_NET_ADMIN));
3414 release_sock(sk);
3415 return err;
3416 }
3417 case TCP_ULP: {
3418 char name[TCP_ULP_NAME_MAX];
3419
3420 if (optlen < 1)
3421 return -EINVAL;
3422
3423 val = strncpy_from_sockptr(name, optval,
3424 min_t(long, TCP_ULP_NAME_MAX - 1,
3425 optlen));
3426 if (val < 0)
3427 return -EFAULT;
3428 name[val] = 0;
3429
3430 lock_sock(sk);
3431 err = tcp_set_ulp(sk, name);
3432 release_sock(sk);
3433 return err;
3434 }
3435 case TCP_FASTOPEN_KEY: {
3436 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3437 __u8 *backup_key = NULL;
3438
3439
3440
3441
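		/* The key may optionally be followed by a second, backup key
		 * of the same length to support key rotation.
		 */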
3442 if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3443 optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3444 return -EINVAL;
3445
3446 if (copy_from_sockptr(key, optval, optlen))
3447 return -EFAULT;
3448
3449 if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3450 backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3451
3452 return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
3453 }
3454 default:
3455
3456 break;
3457 }
3458
3459 if (optlen < sizeof(int))
3460 return -EINVAL;
3461
3462 if (copy_from_sockptr(&val, optval, sizeof(val)))
3463 return -EFAULT;
3464
3465 lock_sock(sk);
3466
3467 switch (optname) {
3468 case TCP_MAXSEG:
3469
3470
3471
3472
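		/* Values greater than the interface MTU will not take effect,
		 * but at this point we usually do not yet know which interface
		 * will be used.
		 */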
3473 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
3474 err = -EINVAL;
3475 break;
3476 }
3477 tp->rx_opt.user_mss = val;
3478 break;
3479
3480 case TCP_NODELAY:
3481 __tcp_sock_set_nodelay(sk, val);
3482 break;
3483
3484 case TCP_THIN_LINEAR_TIMEOUTS:
3485 if (val < 0 || val > 1)
3486 err = -EINVAL;
3487 else
3488 tp->thin_lto = val;
3489 break;
3490
3491 case TCP_THIN_DUPACK:
3492 if (val < 0 || val > 1)
3493 err = -EINVAL;
3494 break;
3495
3496 case TCP_REPAIR:
3497 if (!tcp_can_repair_sock(sk))
3498 err = -EPERM;
3499 else if (val == TCP_REPAIR_ON) {
3500 tp->repair = 1;
3501 sk->sk_reuse = SK_FORCE_REUSE;
3502 tp->repair_queue = TCP_NO_QUEUE;
3503 } else if (val == TCP_REPAIR_OFF) {
3504 tp->repair = 0;
3505 sk->sk_reuse = SK_NO_REUSE;
3506 tcp_send_window_probe(sk);
3507 } else if (val == TCP_REPAIR_OFF_NO_WP) {
3508 tp->repair = 0;
3509 sk->sk_reuse = SK_NO_REUSE;
3510 } else
3511 err = -EINVAL;
3512
3513 break;
3514
3515 case TCP_REPAIR_QUEUE:
3516 if (!tp->repair)
3517 err = -EPERM;
3518 else if ((unsigned int)val < TCP_QUEUES_NR)
3519 tp->repair_queue = val;
3520 else
3521 err = -EINVAL;
3522 break;
3523
3524 case TCP_QUEUE_SEQ:
3525 if (sk->sk_state != TCP_CLOSE) {
3526 err = -EPERM;
3527 } else if (tp->repair_queue == TCP_SEND_QUEUE) {
3528 if (!tcp_rtx_queue_empty(sk))
3529 err = -EPERM;
3530 else
3531 WRITE_ONCE(tp->write_seq, val);
3532 } else if (tp->repair_queue == TCP_RECV_QUEUE) {
3533 if (tp->rcv_nxt != tp->copied_seq) {
3534 err = -EPERM;
3535 } else {
3536 WRITE_ONCE(tp->rcv_nxt, val);
3537 WRITE_ONCE(tp->copied_seq, val);
3538 }
3539 } else {
3540 err = -EINVAL;
3541 }
3542 break;
3543
3544 case TCP_REPAIR_OPTIONS:
3545 if (!tp->repair)
3546 err = -EINVAL;
3547 else if (sk->sk_state == TCP_ESTABLISHED)
3548 err = tcp_repair_options_est(sk, optval, optlen);
3549 else
3550 err = -EPERM;
3551 break;
3552
3553 case TCP_CORK:
3554 __tcp_sock_set_cork(sk, val);
3555 break;
3556
3557 case TCP_KEEPIDLE:
3558 err = tcp_sock_set_keepidle_locked(sk, val);
3559 break;
3560 case TCP_KEEPINTVL:
3561 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3562 err = -EINVAL;
3563 else
3564 tp->keepalive_intvl = val * HZ;
3565 break;
3566 case TCP_KEEPCNT:
3567 if (val < 1 || val > MAX_TCP_KEEPCNT)
3568 err = -EINVAL;
3569 else
3570 tp->keepalive_probes = val;
3571 break;
3572 case TCP_SYNCNT:
3573 if (val < 1 || val > MAX_TCP_SYNCNT)
3574 err = -EINVAL;
3575 else
3576 icsk->icsk_syn_retries = val;
3577 break;
3578
3579 case TCP_SAVE_SYN:
3580
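		/* 0: disable, 1: save the SYN headers, 2: save them starting
		 * from the link-layer (MAC) header.
		 */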
3581 if (val < 0 || val > 2)
3582 err = -EINVAL;
3583 else
3584 tp->save_syn = val;
3585 break;
3586
3587 case TCP_LINGER2:
3588 if (val < 0)
3589 tp->linger2 = -1;
3590 else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3591 tp->linger2 = TCP_FIN_TIMEOUT_MAX;
3592 else
3593 tp->linger2 = val * HZ;
3594 break;
3595
3596 case TCP_DEFER_ACCEPT:
3597
3598 icsk->icsk_accept_queue.rskq_defer_accept =
3599 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3600 TCP_RTO_MAX / HZ);
3601 break;
3602
3603 case TCP_WINDOW_CLAMP:
3604 err = tcp_set_window_clamp(sk, val);
3605 break;
3606
3607 case TCP_QUICKACK:
3608 __tcp_sock_set_quickack(sk, val);
3609 break;
3610
3611#ifdef CONFIG_TCP_MD5SIG
3612 case TCP_MD5SIG:
3613 case TCP_MD5SIG_EXT:
3614 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3615 break;
3616#endif
3617 case TCP_USER_TIMEOUT:
3618
3619
3620
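		/* Cap, in milliseconds, on how long transmitted data may stay
		 * unacknowledged (or probes unanswered) before the connection
		 * is aborted; 0 selects the default behaviour.
		 */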
3621 if (val < 0)
3622 err = -EINVAL;
3623 else
3624 icsk->icsk_user_timeout = val;
3625 break;
3626
3627 case TCP_FASTOPEN:
3628 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3629 TCPF_LISTEN))) {
3630 tcp_fastopen_init_key_once(net);
3631
3632 fastopen_queue_tune(sk, val);
3633 } else {
3634 err = -EINVAL;
3635 }
3636 break;
3637 case TCP_FASTOPEN_CONNECT:
3638 if (val > 1 || val < 0) {
3639 err = -EINVAL;
3640 } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3641 if (sk->sk_state == TCP_CLOSE)
3642 tp->fastopen_connect = val;
3643 else
3644 err = -EINVAL;
3645 } else {
3646 err = -EOPNOTSUPP;
3647 }
3648 break;
3649 case TCP_FASTOPEN_NO_COOKIE:
3650 if (val > 1 || val < 0)
3651 err = -EINVAL;
3652 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3653 err = -EINVAL;
3654 else
3655 tp->fastopen_no_cookie = val;
3656 break;
3657 case TCP_TIMESTAMP:
3658 if (!tp->repair)
3659 err = -EPERM;
3660 else
3661 tp->tsoffset = val - tcp_time_stamp_raw();
3662 break;
3663 case TCP_REPAIR_WINDOW:
3664 err = tcp_repair_set_window(tp, optval, optlen);
3665 break;
3666 case TCP_NOTSENT_LOWAT:
3667 tp->notsent_lowat = val;
3668 sk->sk_write_space(sk);
3669 break;
3670 case TCP_INQ:
3671 if (val > 1 || val < 0)
3672 err = -EINVAL;
3673 else
3674 tp->recvmsg_inq = val;
3675 break;
3676 case TCP_TX_DELAY:
3677 if (val)
3678 tcp_enable_tx_delay();
3679 tp->tcp_tx_delay = val;
3680 break;
3681 default:
3682 err = -ENOPROTOOPT;
3683 break;
3684 }
3685
3686 release_sock(sk);
3687 return err;
3688}
3689
3690int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
3691 unsigned int optlen)
3692{
3693 const struct inet_connection_sock *icsk = inet_csk(sk);
3694
3695 if (level != SOL_TCP)
3696 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3697 optval, optlen);
3698 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3699}
3700EXPORT_SYMBOL(tcp_setsockopt);
3701
3702static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3703 struct tcp_info *info)
3704{
3705 u64 stats[__TCP_CHRONO_MAX], total = 0;
3706 enum tcp_chrono i;
3707
3708 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3709 stats[i] = tp->chrono_stat[i - 1];
3710 if (i == tp->chrono_type)
3711 stats[i] += tcp_jiffies32 - tp->chrono_start;
3712 stats[i] *= USEC_PER_SEC / HZ;
3713 total += stats[i];
3714 }
3715
3716 info->tcpi_busy_time = total;
3717 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3718 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3719}
3720
3721
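/* Return information about the state of a TCP endpoint in API format. */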
3722void tcp_get_info(struct sock *sk, struct tcp_info *info)
3723{
3724 const struct tcp_sock *tp = tcp_sk(sk);
3725 const struct inet_connection_sock *icsk = inet_csk(sk);
3726 unsigned long rate;
3727 u32 now;
3728 u64 rate64;
3729 bool slow;
3730
3731 memset(info, 0, sizeof(*info));
3732 if (sk->sk_type != SOCK_STREAM)
3733 return;
3734
3735 info->tcpi_state = inet_sk_state_load(sk);
3736
3737
3738 rate = READ_ONCE(sk->sk_pacing_rate);
3739 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3740 info->tcpi_pacing_rate = rate64;
3741
3742 rate = READ_ONCE(sk->sk_max_pacing_rate);
3743 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3744 info->tcpi_max_pacing_rate = rate64;
3745
3746 info->tcpi_reordering = tp->reordering;
3747 info->tcpi_snd_cwnd = tp->snd_cwnd;
3748
3749 if (info->tcpi_state == TCP_LISTEN) {
3750
3751
3752
3753
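		/* Listener-specific meaning of these fields:
		 * tcpi_unacked -> number of children ready for accept()
		 * tcpi_sacked  -> maximum accept backlog
		 */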
3754 info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3755 info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
3756 return;
3757 }
3758
3759 slow = lock_sock_fast(sk);
3760
3761 info->tcpi_ca_state = icsk->icsk_ca_state;
3762 info->tcpi_retransmits = icsk->icsk_retransmits;
3763 info->tcpi_probes = icsk->icsk_probes_out;
3764 info->tcpi_backoff = icsk->icsk_backoff;
3765
3766 if (tp->rx_opt.tstamp_ok)
3767 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3768 if (tcp_is_sack(tp))
3769 info->tcpi_options |= TCPI_OPT_SACK;
3770 if (tp->rx_opt.wscale_ok) {
3771 info->tcpi_options |= TCPI_OPT_WSCALE;
3772 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3773 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3774 }
3775
3776 if (tp->ecn_flags & TCP_ECN_OK)
3777 info->tcpi_options |= TCPI_OPT_ECN;
3778 if (tp->ecn_flags & TCP_ECN_SEEN)
3779 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3780 if (tp->syn_data_acked)
3781 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3782
3783 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3784 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3785 info->tcpi_snd_mss = tp->mss_cache;
3786 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3787
3788 info->tcpi_unacked = tp->packets_out;
3789 info->tcpi_sacked = tp->sacked_out;
3790
3791 info->tcpi_lost = tp->lost_out;
3792 info->tcpi_retrans = tp->retrans_out;
3793
3794 now = tcp_jiffies32;
3795 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3796 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3797 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3798
3799 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3800 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3801 info->tcpi_rtt = tp->srtt_us >> 3;
3802 info->tcpi_rttvar = tp->mdev_us >> 2;
3803 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3804 info->tcpi_advmss = tp->advmss;
3805
3806 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3807 info->tcpi_rcv_space = tp->rcvq_space.space;
3808
3809 info->tcpi_total_retrans = tp->total_retrans;
3810
3811 info->tcpi_bytes_acked = tp->bytes_acked;
3812 info->tcpi_bytes_received = tp->bytes_received;
3813 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3814 tcp_get_info_chrono_stats(tp, info);
3815
3816 info->tcpi_segs_out = tp->segs_out;
3817
3818
3819 info->tcpi_segs_in = READ_ONCE(tp->segs_in);
3820 info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
3821
3822 info->tcpi_min_rtt = tcp_min_rtt(tp);
3823 info->tcpi_data_segs_out = tp->data_segs_out;
3824
3825 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3826 rate64 = tcp_compute_delivery_rate(tp);
3827 if (rate64)
3828 info->tcpi_delivery_rate = rate64;
3829 info->tcpi_delivered = tp->delivered;
3830 info->tcpi_delivered_ce = tp->delivered_ce;
3831 info->tcpi_bytes_sent = tp->bytes_sent;
3832 info->tcpi_bytes_retrans = tp->bytes_retrans;
3833 info->tcpi_dsack_dups = tp->dsack_dups;
3834 info->tcpi_reord_seen = tp->reord_seen;
3835 info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3836 info->tcpi_snd_wnd = tp->snd_wnd;
3837 info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
3838 unlock_sock_fast(sk, slow);
3839}
3840EXPORT_SYMBOL_GPL(tcp_get_info);
3841
3842static size_t tcp_opt_stats_get_size(void)
3843{
3844	return
3845		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
3846		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
3847		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
3848		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
3849		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
3850		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
3851		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
3852		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
3853		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
3854		nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
3855		nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
3856		nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
3857		nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
3858		nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
3859		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
3860		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
3861		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
3862		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
3863		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
3864		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
3865		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
3866		nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
3867		nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
3868		nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
3869		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
3870		nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
3871		0;
3872}
3873
3874
3875static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
3876{
3877 if (skb->protocol == htons(ETH_P_IP))
3878 return ip_hdr(skb)->ttl;
3879 else if (skb->protocol == htons(ETH_P_IPV6))
3880 return ipv6_hdr(skb)->hop_limit;
3881 else
3882 return 0;
3883}
3884
3885struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3886 const struct sk_buff *orig_skb,
3887 const struct sk_buff *ack_skb)
3888{
3889 const struct tcp_sock *tp = tcp_sk(sk);
3890 struct sk_buff *stats;
3891 struct tcp_info info;
3892 unsigned long rate;
3893 u64 rate64;
3894
3895 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3896 if (!stats)
3897 return NULL;
3898
3899 tcp_get_info_chrono_stats(tp, &info);
3900 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3901 info.tcpi_busy_time, TCP_NLA_PAD);
3902 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3903 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3904 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3905 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3906 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3907 tp->data_segs_out, TCP_NLA_PAD);
3908 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3909 tp->total_retrans, TCP_NLA_PAD);
3910
3911 rate = READ_ONCE(sk->sk_pacing_rate);
3912 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3913 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3914
3915 rate64 = tcp_compute_delivery_rate(tp);
3916 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3917
3918 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3919 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3920 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3921
3922 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3923 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3924 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3925 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3926 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3927
3928 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3929 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3930
3931 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3932 TCP_NLA_PAD);
3933 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3934 TCP_NLA_PAD);
3935 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3936 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3937 nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3938 nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
3939 nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
3940 max_t(int, 0, tp->write_seq - tp->snd_nxt));
3941 nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
3942 TCP_NLA_PAD);
3943 if (ack_skb)
3944 nla_put_u8(stats, TCP_NLA_TTL,
3945 tcp_skb_ttl_or_hop_limit(ack_skb));
3946
3947 return stats;
3948}
3949
3950static int do_tcp_getsockopt(struct sock *sk, int level,
3951 int optname, char __user *optval, int __user *optlen)
3952{
3953 struct inet_connection_sock *icsk = inet_csk(sk);
3954 struct tcp_sock *tp = tcp_sk(sk);
3955 struct net *net = sock_net(sk);
3956 int val, len;
3957
3958 if (get_user(len, optlen))
3959 return -EFAULT;
3960
3961 len = min_t(unsigned int, len, sizeof(int));
3962
3963 if (len < 0)
3964 return -EINVAL;
3965
3966 switch (optname) {
3967 case TCP_MAXSEG:
3968 val = tp->mss_cache;
3969 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3970 val = tp->rx_opt.user_mss;
3971 if (tp->repair)
3972 val = tp->rx_opt.mss_clamp;
3973 break;
3974 case TCP_NODELAY:
3975 val = !!(tp->nonagle&TCP_NAGLE_OFF);
3976 break;
3977 case TCP_CORK:
3978 val = !!(tp->nonagle&TCP_NAGLE_CORK);
3979 break;
3980 case TCP_KEEPIDLE:
3981 val = keepalive_time_when(tp) / HZ;
3982 break;
3983 case TCP_KEEPINTVL:
3984 val = keepalive_intvl_when(tp) / HZ;
3985 break;
3986 case TCP_KEEPCNT:
3987 val = keepalive_probes(tp);
3988 break;
3989 case TCP_SYNCNT:
3990 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3991 break;
3992 case TCP_LINGER2:
3993 val = tp->linger2;
3994 if (val >= 0)
3995 val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3996 break;
3997 case TCP_DEFER_ACCEPT:
3998 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3999 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
4000 break;
4001 case TCP_WINDOW_CLAMP:
4002 val = tp->window_clamp;
4003 break;
4004 case TCP_INFO: {
4005 struct tcp_info info;
4006
4007 if (get_user(len, optlen))
4008 return -EFAULT;
4009
4010 tcp_get_info(sk, &info);
4011
4012 len = min_t(unsigned int, len, sizeof(info));
4013 if (put_user(len, optlen))
4014 return -EFAULT;
4015 if (copy_to_user(optval, &info, len))
4016 return -EFAULT;
4017 return 0;
4018 }
4019 case TCP_CC_INFO: {
4020 const struct tcp_congestion_ops *ca_ops;
4021 union tcp_cc_info info;
4022 size_t sz = 0;
4023 int attr;
4024
4025 if (get_user(len, optlen))
4026 return -EFAULT;
4027
4028 ca_ops = icsk->icsk_ca_ops;
4029 if (ca_ops && ca_ops->get_info)
4030 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
4031
4032 len = min_t(unsigned int, len, sz);
4033 if (put_user(len, optlen))
4034 return -EFAULT;
4035 if (copy_to_user(optval, &info, len))
4036 return -EFAULT;
4037 return 0;
4038 }
4039 case TCP_QUICKACK:
4040 val = !inet_csk_in_pingpong_mode(sk);
4041 break;
4042
4043 case TCP_CONGESTION:
4044 if (get_user(len, optlen))
4045 return -EFAULT;
4046 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
4047 if (put_user(len, optlen))
4048 return -EFAULT;
4049 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
4050 return -EFAULT;
4051 return 0;
4052
4053 case TCP_ULP:
4054 if (get_user(len, optlen))
4055 return -EFAULT;
4056 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
4057 if (!icsk->icsk_ulp_ops) {
4058 if (put_user(0, optlen))
4059 return -EFAULT;
4060 return 0;
4061 }
4062 if (put_user(len, optlen))
4063 return -EFAULT;
4064 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
4065 return -EFAULT;
4066 return 0;
4067
4068 case TCP_FASTOPEN_KEY: {
4069 u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
4070 unsigned int key_len;
4071
4072 if (get_user(len, optlen))
4073 return -EFAULT;
4074
4075 key_len = tcp_fastopen_get_cipher(net, icsk, key) *
4076 TCP_FASTOPEN_KEY_LENGTH;
4077 len = min_t(unsigned int, len, key_len);
4078 if (put_user(len, optlen))
4079 return -EFAULT;
4080 if (copy_to_user(optval, key, len))
4081 return -EFAULT;
4082 return 0;
4083 }
4084 case TCP_THIN_LINEAR_TIMEOUTS:
4085 val = tp->thin_lto;
4086 break;
4087
4088 case TCP_THIN_DUPACK:
4089 val = 0;
4090 break;
4091
4092 case TCP_REPAIR:
4093 val = tp->repair;
4094 break;
4095
4096 case TCP_REPAIR_QUEUE:
4097 if (tp->repair)
4098 val = tp->repair_queue;
4099 else
4100 return -EINVAL;
4101 break;
4102
4103 case TCP_REPAIR_WINDOW: {
4104 struct tcp_repair_window opt;
4105
4106 if (get_user(len, optlen))
4107 return -EFAULT;
4108
4109 if (len != sizeof(opt))
4110 return -EINVAL;
4111
4112 if (!tp->repair)
4113 return -EPERM;
4114
4115 opt.snd_wl1 = tp->snd_wl1;
4116 opt.snd_wnd = tp->snd_wnd;
4117 opt.max_window = tp->max_window;
4118 opt.rcv_wnd = tp->rcv_wnd;
4119 opt.rcv_wup = tp->rcv_wup;
4120
4121 if (copy_to_user(optval, &opt, len))
4122 return -EFAULT;
4123 return 0;
4124 }
4125 case TCP_QUEUE_SEQ:
4126 if (tp->repair_queue == TCP_SEND_QUEUE)
4127 val = tp->write_seq;
4128 else if (tp->repair_queue == TCP_RECV_QUEUE)
4129 val = tp->rcv_nxt;
4130 else
4131 return -EINVAL;
4132 break;
4133
4134 case TCP_USER_TIMEOUT:
4135 val = icsk->icsk_user_timeout;
4136 break;
4137
4138 case TCP_FASTOPEN:
4139 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
4140 break;
4141
4142 case TCP_FASTOPEN_CONNECT:
4143 val = tp->fastopen_connect;
4144 break;
4145
4146 case TCP_FASTOPEN_NO_COOKIE:
4147 val = tp->fastopen_no_cookie;
4148 break;
4149
4150 case TCP_TX_DELAY:
4151 val = tp->tcp_tx_delay;
4152 break;
4153
4154 case TCP_TIMESTAMP:
4155 val = tcp_time_stamp_raw() + tp->tsoffset;
4156 break;
4157 case TCP_NOTSENT_LOWAT:
4158 val = tp->notsent_lowat;
4159 break;
4160 case TCP_INQ:
4161 val = tp->recvmsg_inq;
4162 break;
4163 case TCP_SAVE_SYN:
4164 val = tp->save_syn;
4165 break;
4166 case TCP_SAVED_SYN: {
4167 if (get_user(len, optlen))
4168 return -EFAULT;
4169
4170 lock_sock(sk);
4171 if (tp->saved_syn) {
4172 if (len < tcp_saved_syn_len(tp->saved_syn)) {
4173 if (put_user(tcp_saved_syn_len(tp->saved_syn),
4174 optlen)) {
4175 release_sock(sk);
4176 return -EFAULT;
4177 }
4178 release_sock(sk);
4179 return -EINVAL;
4180 }
4181 len = tcp_saved_syn_len(tp->saved_syn);
4182 if (put_user(len, optlen)) {
4183 release_sock(sk);
4184 return -EFAULT;
4185 }
4186 if (copy_to_user(optval, tp->saved_syn->data, len)) {
4187 release_sock(sk);
4188 return -EFAULT;
4189 }
4190 tcp_saved_syn_free(tp);
4191 release_sock(sk);
4192 } else {
4193 release_sock(sk);
4194 len = 0;
4195 if (put_user(len, optlen))
4196 return -EFAULT;
4197 }
4198 return 0;
4199 }
4200#ifdef CONFIG_MMU
4201 case TCP_ZEROCOPY_RECEIVE: {
4202 struct scm_timestamping_internal tss;
4203 struct tcp_zerocopy_receive zc = {};
4204 int err;
4205
4206 if (get_user(len, optlen))
4207 return -EFAULT;
4208 if (len < 0 ||
4209 len < offsetofend(struct tcp_zerocopy_receive, length))
4210 return -EINVAL;
4211 if (unlikely(len > sizeof(zc))) {
4212 err = check_zeroed_user(optval + sizeof(zc),
4213 len - sizeof(zc));
4214 if (err < 1)
4215 return err == 0 ? -EINVAL : err;
4216 len = sizeof(zc);
4217 if (put_user(len, optlen))
4218 return -EFAULT;
4219 }
4220 if (copy_from_user(&zc, optval, len))
4221 return -EFAULT;
4222 if (zc.reserved)
4223 return -EINVAL;
4224 if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
4225 return -EINVAL;
4226 lock_sock(sk);
4227 err = tcp_zerocopy_receive(sk, &zc, &tss);
4228 err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
4229 &zc, &len, err);
4230 release_sock(sk);
4231 sk_defer_free_flush(sk);
4232 if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
4233 goto zerocopy_rcv_cmsg;
4234 switch (len) {
4235 case offsetofend(struct tcp_zerocopy_receive, msg_flags):
4236 goto zerocopy_rcv_cmsg;
4237 case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
4238 case offsetofend(struct tcp_zerocopy_receive, msg_control):
4239 case offsetofend(struct tcp_zerocopy_receive, flags):
4240 case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
4241 case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
4242 case offsetofend(struct tcp_zerocopy_receive, err):
4243 goto zerocopy_rcv_sk_err;
4244 case offsetofend(struct tcp_zerocopy_receive, inq):
4245 goto zerocopy_rcv_inq;
4246 case offsetofend(struct tcp_zerocopy_receive, length):
4247 default:
4248 goto zerocopy_rcv_out;
4249 }
4250zerocopy_rcv_cmsg:
4251 if (zc.msg_flags & TCP_CMSG_TS)
4252 tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
4253 else
4254 zc.msg_flags = 0;
4255zerocopy_rcv_sk_err:
4256 if (!err)
4257 zc.err = sock_error(sk);
4258zerocopy_rcv_inq:
4259 zc.inq = tcp_inq_hint(sk);
4260zerocopy_rcv_out:
4261 if (!err && copy_to_user(optval, &zc, len))
4262 err = -EFAULT;
4263 return err;
4264 }
4265#endif
4266 default:
4267 return -ENOPROTOOPT;
4268 }
4269
4270 if (put_user(len, optlen))
4271 return -EFAULT;
4272 if (copy_to_user(optval, &val, len))
4273 return -EFAULT;
4274 return 0;
4275}
4276
4277bool tcp_bpf_bypass_getsockopt(int level, int optname)
4278{
4279
4280
4281
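	/* do_tcp_getsockopt() implements TCP_ZEROCOPY_RECEIVE itself, running
	 * the cgroup BPF program without taking an extra socket lock, so the
	 * generic BPF getsockopt hook is bypassed for that option.
	 */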
4282 if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
4283 return true;
4284
4285 return false;
4286}
4287EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
4288
4289int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
4290 int __user *optlen)
4291{
4292 struct inet_connection_sock *icsk = inet_csk(sk);
4293
4294 if (level != SOL_TCP)
4295 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
4296 optval, optlen);
4297 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
4298}
4299EXPORT_SYMBOL(tcp_getsockopt);
4300
4301#ifdef CONFIG_TCP_MD5SIG
4302static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
4303static DEFINE_MUTEX(tcp_md5sig_mutex);
4304static bool tcp_md5sig_pool_populated = false;
4305
4306static void __tcp_alloc_md5sig_pool(void)
4307{
4308 struct crypto_ahash *hash;
4309 int cpu;
4310
4311 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
4312 if (IS_ERR(hash))
4313 return;
4314
4315 for_each_possible_cpu(cpu) {
4316 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
4317 struct ahash_request *req;
4318
4319 if (!scratch) {
4320 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
4321 sizeof(struct tcphdr),
4322 GFP_KERNEL,
4323 cpu_to_node(cpu));
4324 if (!scratch)
4325 return;
4326 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
4327 }
4328 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
4329 continue;
4330
4331 req = ahash_request_alloc(hash, GFP_KERNEL);
4332 if (!req)
4333 return;
4334
4335 ahash_request_set_callback(req, 0, NULL, NULL);
4336
4337 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
4338 }
4339
4340
4341
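	/* All writes above must be visible before tcp_md5sig_pool_populated
	 * is set; pairs with the smp_rmb() in tcp_get_md5sig_pool().
	 */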
4342 smp_wmb();
4343 tcp_md5sig_pool_populated = true;
4344}
4345
4346bool tcp_alloc_md5sig_pool(void)
4347{
4348 if (unlikely(!tcp_md5sig_pool_populated)) {
4349 mutex_lock(&tcp_md5sig_mutex);
4350
4351 if (!tcp_md5sig_pool_populated) {
4352 __tcp_alloc_md5sig_pool();
4353 if (tcp_md5sig_pool_populated)
4354 static_branch_inc(&tcp_md5_needed);
4355 }
4356
4357 mutex_unlock(&tcp_md5sig_mutex);
4358 }
4359 return tcp_md5sig_pool_populated;
4360}
4361EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
4362
4363
4364
4365
4366
4367
4368
4369
4370
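/* tcp_get_md5sig_pool - get an md5sig_pool for this user
 *
 * The pool is per-cpu, so on success we return with BH disabled to make sure
 * no other context on this CPU can take it from under us.
 */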
4371struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
4372{
4373 local_bh_disable();
4374
4375 if (tcp_md5sig_pool_populated) {
4376
4377 smp_rmb();
4378 return this_cpu_ptr(&tcp_md5sig_pool);
4379 }
4380 local_bh_enable();
4381 return NULL;
4382}
4383EXPORT_SYMBOL(tcp_get_md5sig_pool);
4384
4385int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
4386 const struct sk_buff *skb, unsigned int header_len)
4387{
4388 struct scatterlist sg;
4389 const struct tcphdr *tp = tcp_hdr(skb);
4390 struct ahash_request *req = hp->md5_req;
4391 unsigned int i;
4392 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
4393 skb_headlen(skb) - header_len : 0;
4394 const struct skb_shared_info *shi = skb_shinfo(skb);
4395 struct sk_buff *frag_iter;
4396
4397 sg_init_table(&sg, 1);
4398
4399 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
4400 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
4401 if (crypto_ahash_update(req))
4402 return 1;
4403
4404 for (i = 0; i < shi->nr_frags; ++i) {
4405 const skb_frag_t *f = &shi->frags[i];
4406 unsigned int offset = skb_frag_off(f);
4407 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
4408
4409 sg_set_page(&sg, page, skb_frag_size(f),
4410 offset_in_page(offset));
4411 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
4412 if (crypto_ahash_update(req))
4413 return 1;
4414 }
4415
4416 skb_walk_frags(skb, frag_iter)
4417 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
4418 return 1;
4419
4420 return 0;
4421}
4422EXPORT_SYMBOL(tcp_md5_hash_skb_data);
4423
4424int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
4425{
4426 u8 keylen = READ_ONCE(key->keylen);
4427 struct scatterlist sg;
4428
4429 sg_init_one(&sg, key->key, keylen);
4430 ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
4431
4432
4433 return data_race(crypto_ahash_update(hp->md5_req));
4434}
4435EXPORT_SYMBOL(tcp_md5_hash_key);
4436
4437#endif
4438
4439void tcp_done(struct sock *sk)
4440{
4441 struct request_sock *req;
4442
4443
4444
4445
4446
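	/* The socket may not be locked here (it can be a freshly created
	 * socket being force-closed), so do not use lockdep_sock_is_held().
	 */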
4447 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
4448
4449 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
4450 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
4451
4452 tcp_set_state(sk, TCP_CLOSE);
4453 tcp_clear_xmit_timers(sk);
4454 if (req)
4455 reqsk_fastopen_remove(sk, req, false);
4456
4457 sk->sk_shutdown = SHUTDOWN_MASK;
4458
4459 if (!sock_flag(sk, SOCK_DEAD))
4460 sk->sk_state_change(sk);
4461 else
4462 inet_csk_destroy_sock(sk);
4463}
4464EXPORT_SYMBOL_GPL(tcp_done);
4465
4466int tcp_abort(struct sock *sk, int err)
4467{
4468 if (!sk_fullsock(sk)) {
4469 if (sk->sk_state == TCP_NEW_SYN_RECV) {
4470 struct request_sock *req = inet_reqsk(sk);
4471
4472 local_bh_disable();
4473 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
4474 local_bh_enable();
4475 return 0;
4476 }
4477 return -EOPNOTSUPP;
4478 }
4479
4480
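	/* Do not race with userspace socket closes such as tcp_close(). */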
4481 lock_sock(sk);
4482
4483 if (sk->sk_state == TCP_LISTEN) {
4484 tcp_set_state(sk, TCP_CLOSE);
4485 inet_csk_listen_stop(sk);
4486 }
4487
4488
4489 local_bh_disable();
4490 bh_lock_sock(sk);
4491
4492 if (!sock_flag(sk, SOCK_DEAD)) {
4493 sk->sk_err = err;
4494
4495 smp_wmb();
4496 sk_error_report(sk);
4497 if (tcp_need_reset(sk->sk_state))
4498 tcp_send_active_reset(sk, GFP_ATOMIC);
4499 tcp_done(sk);
4500 }
4501
4502 bh_unlock_sock(sk);
4503 local_bh_enable();
4504 tcp_write_queue_purge(sk);
4505 release_sock(sk);
4506 return 0;
4507}
4508EXPORT_SYMBOL_GPL(tcp_abort);
4509
4510extern struct tcp_congestion_ops tcp_reno;
4511
4512static __initdata unsigned long thash_entries;
4513static int __init set_thash_entries(char *str)
4514{
4515 ssize_t ret;
4516
4517 if (!str)
4518 return 0;
4519
4520 ret = kstrtoul(str, 0, &thash_entries);
4521 if (ret)
4522 return 0;
4523
4524 return 1;
4525}
4526__setup("thash_entries=", set_thash_entries);
4527
4528static void __init tcp_init_mem(void)
4529{
4530 unsigned long limit = nr_free_buffer_pages() / 16;
4531
4532 limit = max(limit, 128UL);
4533 sysctl_tcp_mem[0] = limit / 4 * 3;
4534 sysctl_tcp_mem[1] = limit;
4535 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
4536}
4537
4538void __init tcp_init(void)
4539{
4540 int max_rshare, max_wshare, cnt;
4541 unsigned long limit;
4542 unsigned int i;
4543
4544 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
4545 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
4546 sizeof_field(struct sk_buff, cb));
4547
4548 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
4549
4550 timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
4551 mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
4552
4553 inet_hashinfo_init(&tcp_hashinfo);
4554 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
4555 thash_entries, 21,
4556 0, 64 * 1024);
4557 tcp_hashinfo.bind_bucket_cachep =
4558 kmem_cache_create("tcp_bind_bucket",
4559 sizeof(struct inet_bind_bucket), 0,
4560 SLAB_HWCACHE_ALIGN | SLAB_PANIC |
4561 SLAB_ACCOUNT,
4562 NULL);
4563
4564
4565
4566
4567
4568
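	/* Size and allocate the main established and bind bucket hash
	 * tables.
	 */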
4569 tcp_hashinfo.ehash =
4570 alloc_large_system_hash("TCP established",
4571 sizeof(struct inet_ehash_bucket),
4572 thash_entries,
4573 17,
4574 0,
4575 NULL,
4576 &tcp_hashinfo.ehash_mask,
4577 0,
4578 thash_entries ? 0 : 512 * 1024);
4579 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
4580 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4581
4582 if (inet_ehash_locks_alloc(&tcp_hashinfo))
4583 panic("TCP: failed to alloc ehash_locks");
4584 tcp_hashinfo.bhash =
4585 alloc_large_system_hash("TCP bind",
4586 sizeof(struct inet_bind_hashbucket),
4587 tcp_hashinfo.ehash_mask + 1,
4588 17,
4589 0,
4590 &tcp_hashinfo.bhash_size,
4591 NULL,
4592 0,
4593 64 * 1024);
4594 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4595 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4596 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4597 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4598 }
4599
4600
4601 cnt = tcp_hashinfo.ehash_mask + 1;
4602 sysctl_tcp_max_orphans = cnt / 2;
4603
4604 tcp_init_mem();
4605
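	/* Set per-socket limits to no more than 1/128 of the pressure
	 * threshold.
	 */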
4606 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4607 max_wshare = min(4UL*1024*1024, limit);
4608 max_rshare = min(6UL*1024*1024, limit);
4609
4610 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
4611 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4612 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4613
4614 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
4615 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4616 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4617
4618 pr_info("Hash tables configured (established %u bind %u)\n",
4619 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4620
4621 tcp_v4_init();
4622 tcp_metrics_init();
4623 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4624 tcp_tasklet_init();
4625 mptcp_init();
4626}
4627