// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * TCP splice context
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
EXPORT_SYMBOL(tcp_rx_skb_cache_key);

DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);

void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (READ_ONCE(tcp_memory_pressure))
		return;
	val = jiffies;

	/* jiffies can be zero; make sure a nonzero timestamp is stored */
	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!READ_ONCE(tcp_memory_pressure))
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);

/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

/* Convert retransmits to seconds based on initial and max timeout */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
	u32 rate = READ_ONCE(tp->rate_delivered);
	u32 intv = READ_ONCE(tp->rate_interval_us);
	u64 rate64 = 0;

	if (rate && intv) {
		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
		do_div(rate64, intv);
	}
	return rate64;
}

/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things are set to zero explicitly by sk_alloc(),
 *       so they need not be done here.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	icsk->icsk_rto_min = TCP_RTO_MIN;
	icsk->icsk_delack_max = TCP_DELACK_MAX;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
	WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);

	sk_sockets_allocated_inc(sk);
	sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);

static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack = 1;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}
}

static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
					  int target, struct sock *sk)
{
	int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);

	if (avail > 0) {
		if (avail >= target)
			return true;
		if (tcp_rmem_pressure(sk))
			return true;
		if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
			return true;
	}
	if (sk->sk_prot->stream_memory_read)
		return sk->sk_prot->stream_memory_read(sk);
	return false;
}

/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */
	mask = 0;

	/* EPOLLHUP is reported only once the connection is shut down in
	 * both directions or has reached TCP_CLOSE; a half-closed receive
	 * side is reported via EPOLLRDHUP (and EPOLLIN) instead.
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		if (tcp_stream_is_readable(tp, target, sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (__sk_stream_is_writeable(sk, 1)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {  /* send SIGIO later */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after the
				 * wspace test but before the flags are set,
				 * the IO signal would be lost, so re-check
				 * after the barrier.
				 */
				smp_mb__after_atomic();
				if (__sk_stream_is_writeable(sk, 1))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		/* Active TCP fastopen socket with defer_connect:
		 * return EPOLLOUT so the application can call write()
		 * in order for the kernel to generate the SYN + data.
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}

	/* Pairs with the write memory barrier on the error-setting side. */
	smp_rmb();
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data &&
		       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) -
			       READ_ONCE(tp->snd_nxt);
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);
650
651static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
652{
653 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
654 tp->pushed_seq = tp->write_seq;
655}
656
657static inline bool forced_push(const struct tcp_sock *tp)
658{
659 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
660}
661
662static void skb_entail(struct sock *sk, struct sk_buff *skb)
663{
664 struct tcp_sock *tp = tcp_sk(sk);
665 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
666
667 skb->csum = 0;
668 tcb->seq = tcb->end_seq = tp->write_seq;
669 tcb->tcp_flags = TCPHDR_ACK;
670 tcb->sacked = 0;
671 __skb_header_release(skb);
672 tcp_add_write_queue_tail(sk, skb);
673 sk_wmem_queued_add(sk, skb->truesize);
674 sk_mem_charge(sk, skb->truesize);
675 if (tp->nonagle & TCP_NAGLE_PUSH)
676 tp->nonagle &= ~TCP_NAGLE_PUSH;
677
678 tcp_slow_start_after_idle_check(sk);
679}
680
681static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
682{
683 if (flags & MSG_OOB)
684 tp->snd_up = tp->write_seq;
685}
686
687
688
689
690
691
692
693
694
695
696
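/* Autocork only if the tail skb is not yet full, autocorking is enabled,
 * at least one packet is already in flight (rtx queue not empty) and more
 * memory than this skb's truesize is still charged to the socket, i.e.
 * previously sent skbs sit in qdisc/device queues and their TX completion
 * will push this one out shortly, allowing more payload to coalesce.
 */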
697static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
698 int size_goal)
699{
700 return skb->len < size_goal &&
701 sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
702 !tcp_rtx_queue_empty(sk) &&
703 refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
704}
705
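/* Push pending frames to the network: either send them right away, or,
 * when autocorking applies, mark the socket TSQ_THROTTLED and let the
 * pending TX completion flush the queue later.
 */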
706void tcp_push(struct sock *sk, int flags, int mss_now,
707 int nonagle, int size_goal)
708{
709 struct tcp_sock *tp = tcp_sk(sk);
710 struct sk_buff *skb;
711
712 skb = tcp_write_queue_tail(sk);
713 if (!skb)
714 return;
715 if (!(flags & MSG_MORE) || forced_push(tp))
716 tcp_mark_push(tp, skb);
717
718 tcp_mark_urg(tp, flags);
719
720 if (tcp_should_autocork(sk, skb, size_goal)) {
721
722
723 if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
724 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
725 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
726 }
727
728
729
730 if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
731 return;
732 }
733
734 if (flags & MSG_MORE)
735 nonagle = TCP_NAGLE_CORK;
736
737 __tcp_push_pending_frames(sk, mss_now, nonagle);
738}
739
740static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
741 unsigned int offset, size_t len)
742{
743 struct tcp_splice_state *tss = rd_desc->arg.data;
744 int ret;
745
746 ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
747 min(rd_desc->count, len), tss->flags);
748 if (ret > 0)
749 rd_desc->count -= ret;
750 return ret;
751}
752
753static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
754{
755
756 read_descriptor_t rd_desc = {
757 .arg.data = tss,
758 .count = tss->len,
759 };
760
761 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
762}
763
764
765
766
767
768
769
770
771
772
773
774
775
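/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 *  @sock:	socket to splice from
 *  @ppos:	position (not valid)
 *  @pipe:	pipe to splice to
 *  @len:	number of bytes to splice
 *  @flags:	splice modifier flags
 *
 *  Will read pages from the given socket and fill them into a pipe.
 */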
776ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
777 struct pipe_inode_info *pipe, size_t len,
778 unsigned int flags)
779{
780 struct sock *sk = sock->sk;
781 struct tcp_splice_state tss = {
782 .pipe = pipe,
783 .len = len,
784 .flags = flags,
785 };
786 long timeo;
787 ssize_t spliced;
788 int ret;
789
790 sock_rps_record_flow(sk);
791
792
793
794 if (unlikely(*ppos))
795 return -ESPIPE;
796
797 ret = spliced = 0;
798
799 lock_sock(sk);
800
801 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
802 while (tss.len) {
803 ret = __tcp_splice_read(sk, &tss);
804 if (ret < 0)
805 break;
806 else if (!ret) {
807 if (spliced)
808 break;
809 if (sock_flag(sk, SOCK_DONE))
810 break;
811 if (sk->sk_err) {
812 ret = sock_error(sk);
813 break;
814 }
815 if (sk->sk_shutdown & RCV_SHUTDOWN)
816 break;
817 if (sk->sk_state == TCP_CLOSE) {
818
819
820
821
822 ret = -ENOTCONN;
823 break;
824 }
825 if (!timeo) {
826 ret = -EAGAIN;
827 break;
828 }
829
830
831
832
833 if (!skb_queue_empty(&sk->sk_receive_queue))
834 break;
835 sk_wait_data(sk, &timeo, NULL);
836 if (signal_pending(current)) {
837 ret = sock_intr_errno(timeo);
838 break;
839 }
840 continue;
841 }
842 tss.len -= ret;
843 spliced += ret;
844
845 if (!timeo)
846 break;
847 release_sock(sk);
848 lock_sock(sk);
849
850 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
851 (sk->sk_shutdown & RCV_SHUTDOWN) ||
852 signal_pending(current))
853 break;
854 }
855
856 release_sock(sk);
857
858 if (spliced)
859 return spliced;
860
861 return ret;
862}
863EXPORT_SYMBOL(tcp_splice_read);
864
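/* Allocate a new skb for the write queue, reusing the per-socket TX skb
 * cache when the caller asks for no linear data. @force_schedule bypasses
 * the send-buffer memory accounting check (used for the first skb of a
 * flow); on failure, memory pressure is entered and sndbuf is moderated.
 */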
865struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
866 bool force_schedule)
867{
868 struct sk_buff *skb;
869
870 if (likely(!size)) {
871 skb = sk->sk_tx_skb_cache;
872 if (skb) {
873 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
874 sk->sk_tx_skb_cache = NULL;
875 pskb_trim(skb, 0);
876 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
877 skb_shinfo(skb)->tx_flags = 0;
878 memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
879 return skb;
880 }
881 }
882
883 size = ALIGN(size, 4);
884
885 if (unlikely(tcp_under_memory_pressure(sk)))
886 sk_mem_reclaim_partial(sk);
887
888 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
889 if (likely(skb)) {
890 bool mem_scheduled;
891
892 if (force_schedule) {
893 mem_scheduled = true;
894 sk_forced_mem_schedule(sk, skb->truesize);
895 } else {
896 mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
897 }
898 if (likely(mem_scheduled)) {
899 skb_reserve(skb, sk->sk_prot->max_header);
900
901
902
903
904 skb->reserved_tailroom = skb->end - skb->tail - size;
905 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
906 return skb;
907 }
908 __kfree_skb(skb);
909 } else {
910 sk->sk_prot->enter_memory_pressure(sk);
911 sk_stream_moderate_sndbuf(sk);
912 }
913 return NULL;
914}
915
916static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
917 int large_allowed)
918{
919 struct tcp_sock *tp = tcp_sk(sk);
920 u32 new_size_goal, size_goal;
921
922 if (!large_allowed)
923 return mss_now;
924
925
926 new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
927 new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
928
929
930 size_goal = tp->gso_segs * mss_now;
931 if (unlikely(new_size_goal < size_goal ||
932 new_size_goal >= size_goal + mss_now)) {
933 tp->gso_segs = min_t(u16, new_size_goal / mss_now,
934 sk->sk_gso_max_segs);
935 size_goal = tp->gso_segs * mss_now;
936 }
937
938 return max(size_goal, mss_now);
939}
940
941int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
942{
943 int mss_now;
944
945 mss_now = tcp_current_mss(sk);
946 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
947
948 return mss_now;
949}
950
951
952
953
954
955
956
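/* In some cases, both sendpage() and sendmsg() could have added an skb to
 * the write queue, but failed adding payload on it. We need to remove it
 * to consume less memory, but more importantly be able to generate
 * EPOLLOUT for Edge Trigger epoll() users.
 */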
957void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
958{
959 if (skb && !skb->len) {
960 tcp_unlink_write_queue(skb, sk);
961 if (tcp_write_queue_empty(sk))
962 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
963 sk_wmem_free_skb(sk, skb);
964 }
965}
966
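/* Append up to *size bytes of @page at @offset to the tail skb of the
 * write queue as a page fragment, allocating a new skb when needed; on
 * return *size holds the number of bytes actually queued. Returns the skb
 * used, or NULL if no memory could be scheduled.
 */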
967struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
968 struct page *page, int offset, size_t *size)
969{
970 struct sk_buff *skb = tcp_write_queue_tail(sk);
971 struct tcp_sock *tp = tcp_sk(sk);
972 bool can_coalesce;
973 int copy, i;
974
975 if (!skb || (copy = size_goal - skb->len) <= 0 ||
976 !tcp_skb_can_collapse_to(skb)) {
977new_segment:
978 if (!sk_stream_memory_free(sk))
979 return NULL;
980
981 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
982 tcp_rtx_and_write_queues_empty(sk));
983 if (!skb)
984 return NULL;
985
986#ifdef CONFIG_TLS_DEVICE
987 skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
988#endif
989 skb_entail(sk, skb);
990 copy = size_goal;
991 }
992
993 if (copy > *size)
994 copy = *size;
995
996 i = skb_shinfo(skb)->nr_frags;
997 can_coalesce = skb_can_coalesce(skb, i, page, offset);
998 if (!can_coalesce && i >= sysctl_max_skb_frags) {
999 tcp_mark_push(tp, skb);
1000 goto new_segment;
1001 }
1002 if (!sk_wmem_schedule(sk, copy))
1003 return NULL;
1004
1005 if (can_coalesce) {
1006 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1007 } else {
1008 get_page(page);
1009 skb_fill_page_desc(skb, i, page, offset, copy);
1010 }
1011
1012 if (!(flags & MSG_NO_SHARED_FRAGS))
1013 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
1014
1015 skb->len += copy;
1016 skb->data_len += copy;
1017 skb->truesize += copy;
1018 sk_wmem_queued_add(sk, copy);
1019 sk_mem_charge(sk, copy);
1020 skb->ip_summed = CHECKSUM_PARTIAL;
1021 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1022 TCP_SKB_CB(skb)->end_seq += copy;
1023 tcp_skb_pcount_set(skb, 0);
1024
1025 *size = copy;
1026 return skb;
1027}
1028
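/* sendpage() backend: queue page fragments by reference (no data copy)
 * onto the TCP write queue, waiting for connection establishment and
 * send-buffer space as dictated by @flags, then push the frames out.
 */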
1029ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
1030 size_t size, int flags)
1031{
1032 struct tcp_sock *tp = tcp_sk(sk);
1033 int mss_now, size_goal;
1034 int err;
1035 ssize_t copied;
1036 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1037
1038 if (IS_ENABLED(CONFIG_DEBUG_VM) &&
1039 WARN_ONCE(!sendpage_ok(page),
1040 "page must not be a Slab one and have page_count > 0"))
1041 return -EINVAL;
1042
1043
1044
1045
1046
1047 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1048 !tcp_passive_fastopen(sk)) {
1049 err = sk_stream_wait_connect(sk, &timeo);
1050 if (err != 0)
1051 goto out_err;
1052 }
1053
1054 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1055
1056 mss_now = tcp_send_mss(sk, &size_goal, flags);
1057 copied = 0;
1058
1059 err = -EPIPE;
1060 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1061 goto out_err;
1062
1063 while (size > 0) {
1064 struct sk_buff *skb;
1065 size_t copy = size;
1066
		skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
1068 if (!skb)
1069 goto wait_for_space;
1070
1071 if (!copied)
1072 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1073
1074 copied += copy;
1075 offset += copy;
1076 size -= copy;
1077 if (!size)
1078 goto out;
1079
1080 if (skb->len < size_goal || (flags & MSG_OOB))
1081 continue;
1082
1083 if (forced_push(tp)) {
1084 tcp_mark_push(tp, skb);
1085 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1086 } else if (skb == tcp_send_head(sk))
1087 tcp_push_one(sk, mss_now);
1088 continue;
1089
1090wait_for_space:
1091 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1092 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1093 TCP_NAGLE_PUSH, size_goal);
1094
1095 err = sk_stream_wait_memory(sk, &timeo);
1096 if (err != 0)
1097 goto do_error;
1098
1099 mss_now = tcp_send_mss(sk, &size_goal, flags);
1100 }
1101
1102out:
1103 if (copied) {
1104 tcp_tx_timestamp(sk, sk->sk_tsflags);
1105 if (!(flags & MSG_SENDPAGE_NOTLAST))
1106 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1107 }
1108 return copied;
1109
1110do_error:
1111 tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
1112 if (copied)
1113 goto out;
1114out_err:
1115
1116 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1117 sk->sk_write_space(sk);
1118 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1119 }
1120 return sk_stream_error(sk, flags, err);
1121}
1122EXPORT_SYMBOL_GPL(do_tcp_sendpages);
1123
1124int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
1125 size_t size, int flags)
1126{
1127 if (!(sk->sk_route_caps & NETIF_F_SG))
1128 return sock_no_sendpage_locked(sk, page, offset, size, flags);
1129
1130 tcp_rate_check_app_limited(sk);
1131
1132 return do_tcp_sendpages(sk, page, offset, size, flags);
1133}
1134EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
1135
1136int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1137 size_t size, int flags)
1138{
1139 int ret;
1140
1141 lock_sock(sk);
1142 ret = tcp_sendpage_locked(sk, page, offset, size, flags);
1143 release_sock(sk);
1144
1145 return ret;
1146}
1147EXPORT_SYMBOL(tcp_sendpage);
1148
1149void tcp_free_fastopen_req(struct tcp_sock *tp)
1150{
1151 if (tp->fastopen_req) {
1152 kfree(tp->fastopen_req);
1153 tp->fastopen_req = NULL;
1154 }
1155}
1156
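/* Handle MSG_FASTOPEN / TCP_FASTOPEN_CONNECT: stash the user data in
 * tp->fastopen_req and perform the (possibly deferred) connect so that
 * the data can be carried on the SYN.
 */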
1157static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1158 int *copied, size_t size,
1159 struct ubuf_info *uarg)
1160{
1161 struct tcp_sock *tp = tcp_sk(sk);
1162 struct inet_sock *inet = inet_sk(sk);
1163 struct sockaddr *uaddr = msg->msg_name;
1164 int err, flags;
1165
1166 if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
1167 (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1168 uaddr->sa_family == AF_UNSPEC))
1169 return -EOPNOTSUPP;
1170 if (tp->fastopen_req)
1171 return -EALREADY;
1172
1173 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1174 sk->sk_allocation);
1175 if (unlikely(!tp->fastopen_req))
1176 return -ENOBUFS;
1177 tp->fastopen_req->data = msg;
1178 tp->fastopen_req->size = size;
1179 tp->fastopen_req->uarg = uarg;
1180
1181 if (inet->defer_connect) {
1182 err = tcp_connect(sk);
1183
1184 if (err) {
1185 tcp_set_state(sk, TCP_CLOSE);
1186 inet->inet_dport = 0;
1187 sk->sk_route_caps = 0;
1188 }
1189 }
1190 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1191 err = __inet_stream_connect(sk->sk_socket, uaddr,
1192 msg->msg_namelen, flags, 1);
1193
1194
1195
1196 if (tp->fastopen_req) {
1197 *copied = tp->fastopen_req->copied;
1198 tcp_free_fastopen_req(tp);
1199 inet->defer_connect = 0;
1200 }
1201 return err;
1202}
1203
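/* Core sendmsg() path: copies or zero-copies user data into write-queue
 * skbs, honouring MSG_* flags, Fast Open and TCP repair mode, and then
 * pushes the frames out. The caller must hold the socket lock.
 */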
1204int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1205{
1206 struct tcp_sock *tp = tcp_sk(sk);
1207 struct ubuf_info *uarg = NULL;
1208 struct sk_buff *skb;
1209 struct sockcm_cookie sockc;
1210 int flags, err, copied = 0;
1211 int mss_now = 0, size_goal, copied_syn = 0;
1212 int process_backlog = 0;
1213 bool zc = false;
1214 long timeo;
1215
1216 flags = msg->msg_flags;
1217
1218 if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
1219 skb = tcp_write_queue_tail(sk);
1220 uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
1221 if (!uarg) {
1222 err = -ENOBUFS;
1223 goto out_err;
1224 }
1225
1226 zc = sk->sk_route_caps & NETIF_F_SG;
1227 if (!zc)
1228 uarg->zerocopy = 0;
1229 }
1230
1231 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
1232 !tp->repair) {
1233 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
1234 if (err == -EINPROGRESS && copied_syn > 0)
1235 goto out;
1236 else if (err)
1237 goto out_err;
1238 }
1239
1240 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1241
1242 tcp_rate_check_app_limited(sk);
1243
1244
1245
1246
1247
1248 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1249 !tcp_passive_fastopen(sk)) {
1250 err = sk_stream_wait_connect(sk, &timeo);
1251 if (err != 0)
1252 goto do_error;
1253 }
1254
1255 if (unlikely(tp->repair)) {
1256 if (tp->repair_queue == TCP_RECV_QUEUE) {
1257 copied = tcp_send_rcvq(sk, msg, size);
1258 goto out_nopush;
1259 }
1260
1261 err = -EINVAL;
1262 if (tp->repair_queue == TCP_NO_QUEUE)
1263 goto out_err;
1264
1265
1266 }
1267
1268 sockcm_init(&sockc, sk);
1269 if (msg->msg_controllen) {
1270 err = sock_cmsg_send(sk, msg, &sockc);
1271 if (unlikely(err)) {
1272 err = -EINVAL;
1273 goto out_err;
1274 }
1275 }
1276
1277
1278 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1279
1280
1281 copied = 0;
1282
1283restart:
1284 mss_now = tcp_send_mss(sk, &size_goal, flags);
1285
1286 err = -EPIPE;
1287 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1288 goto do_error;
1289
1290 while (msg_data_left(msg)) {
1291 int copy = 0;
1292
1293 skb = tcp_write_queue_tail(sk);
1294 if (skb)
1295 copy = size_goal - skb->len;
1296
1297 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1298 bool first_skb;
1299
1300new_segment:
1301 if (!sk_stream_memory_free(sk))
1302 goto wait_for_space;
1303
1304 if (unlikely(process_backlog >= 16)) {
1305 process_backlog = 0;
1306 if (sk_flush_backlog(sk))
1307 goto restart;
1308 }
1309 first_skb = tcp_rtx_and_write_queues_empty(sk);
1310 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
1311 first_skb);
1312 if (!skb)
1313 goto wait_for_space;
1314
1315 process_backlog++;
1316 skb->ip_summed = CHECKSUM_PARTIAL;
1317
1318 skb_entail(sk, skb);
1319 copy = size_goal;
1320
1321
1322
1323
1324
1325 if (tp->repair)
1326 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1327 }
1328
1329
1330 if (copy > msg_data_left(msg))
1331 copy = msg_data_left(msg);
1332
1333
1334 if (skb_availroom(skb) > 0 && !zc) {
1335
1336 copy = min_t(int, copy, skb_availroom(skb));
1337 err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1338 if (err)
1339 goto do_fault;
1340 } else if (!zc) {
1341 bool merge = true;
1342 int i = skb_shinfo(skb)->nr_frags;
1343 struct page_frag *pfrag = sk_page_frag(sk);
1344
1345 if (!sk_page_frag_refill(sk, pfrag))
1346 goto wait_for_space;
1347
1348 if (!skb_can_coalesce(skb, i, pfrag->page,
1349 pfrag->offset)) {
1350 if (i >= sysctl_max_skb_frags) {
1351 tcp_mark_push(tp, skb);
1352 goto new_segment;
1353 }
1354 merge = false;
1355 }
1356
1357 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1358
1359 if (!sk_wmem_schedule(sk, copy))
1360 goto wait_for_space;
1361
1362 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1363 pfrag->page,
1364 pfrag->offset,
1365 copy);
1366 if (err)
1367 goto do_error;
1368
1369
1370 if (merge) {
1371 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1372 } else {
1373 skb_fill_page_desc(skb, i, pfrag->page,
1374 pfrag->offset, copy);
1375 page_ref_inc(pfrag->page);
1376 }
1377 pfrag->offset += copy;
1378 } else {
1379 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
1380 if (err == -EMSGSIZE || err == -EEXIST) {
1381 tcp_mark_push(tp, skb);
1382 goto new_segment;
1383 }
1384 if (err < 0)
1385 goto do_error;
1386 copy = err;
1387 }
1388
1389 if (!copied)
1390 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1391
1392 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1393 TCP_SKB_CB(skb)->end_seq += copy;
1394 tcp_skb_pcount_set(skb, 0);
1395
1396 copied += copy;
1397 if (!msg_data_left(msg)) {
1398 if (unlikely(flags & MSG_EOR))
1399 TCP_SKB_CB(skb)->eor = 1;
1400 goto out;
1401 }
1402
1403 if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
1404 continue;
1405
1406 if (forced_push(tp)) {
1407 tcp_mark_push(tp, skb);
1408 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1409 } else if (skb == tcp_send_head(sk))
1410 tcp_push_one(sk, mss_now);
1411 continue;
1412
1413wait_for_space:
1414 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1415 if (copied)
1416 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1417 TCP_NAGLE_PUSH, size_goal);
1418
1419 err = sk_stream_wait_memory(sk, &timeo);
1420 if (err != 0)
1421 goto do_error;
1422
1423 mss_now = tcp_send_mss(sk, &size_goal, flags);
1424 }
1425
1426out:
1427 if (copied) {
1428 tcp_tx_timestamp(sk, sockc.tsflags);
1429 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1430 }
1431out_nopush:
1432 sock_zerocopy_put(uarg);
1433 return copied + copied_syn;
1434
1435do_error:
1436 skb = tcp_write_queue_tail(sk);
1437do_fault:
1438 tcp_remove_empty_skb(sk, skb);
1439
1440 if (copied + copied_syn)
1441 goto out;
1442out_err:
1443 sock_zerocopy_put_abort(uarg, true);
1444 err = sk_stream_error(sk, flags, err);
1445
1446 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1447 sk->sk_write_space(sk);
1448 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1449 }
1450 return err;
1451}
1452EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1453
1454int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1455{
1456 int ret;
1457
1458 lock_sock(sk);
1459 ret = tcp_sendmsg_locked(sk, msg, size);
1460 release_sock(sk);
1461
1462 return ret;
1463}
1464EXPORT_SYMBOL(tcp_sendmsg);
1465
1466
1467
1468
1469
1470
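/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */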
1471static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1472{
1473 struct tcp_sock *tp = tcp_sk(sk);
1474
1475
1476 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1477 tp->urg_data == TCP_URG_READ)
1478 return -EINVAL;
1479
1480 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1481 return -ENOTCONN;
1482
1483 if (tp->urg_data & TCP_URG_VALID) {
1484 int err = 0;
1485 char c = tp->urg_data;
1486
1487 if (!(flags & MSG_PEEK))
1488 tp->urg_data = TCP_URG_READ;
1489
1490
1491 msg->msg_flags |= MSG_OOB;
1492
1493 if (len > 0) {
1494 if (!(flags & MSG_TRUNC))
1495 err = memcpy_to_msg(msg, &c, 1);
1496 len = 1;
1497 } else
1498 msg->msg_flags |= MSG_TRUNC;
1499
1500 return err ? -EFAULT : len;
1501 }
1502
1503 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1504 return 0;
1505
1506
1507
1508
1509
1510
1511
1512 return -EAGAIN;
1513}
1514
1515static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1516{
1517 struct sk_buff *skb;
1518 int copied = 0, err = 0;
1519
1520
1521
1522 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1523 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1524 if (err)
1525 return err;
1526 copied += skb->len;
1527 }
1528
1529 skb_queue_walk(&sk->sk_write_queue, skb) {
1530 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1531 if (err)
1532 break;
1533
1534 copied += skb->len;
1535 }
1536
1537 return err ?: copied;
1538}
1539
1540
1541
1542
1543
1544
1545
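/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary. COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculations.
 */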
1546void tcp_cleanup_rbuf(struct sock *sk, int copied)
1547{
1548 struct tcp_sock *tp = tcp_sk(sk);
1549 bool time_to_ack = false;
1550
1551 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1552
1553 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1554 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1555 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1556
1557 if (inet_csk_ack_scheduled(sk)) {
1558 const struct inet_connection_sock *icsk = inet_csk(sk);
1559
		if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1562
1563
1564
1565
1566
1567
1568 (copied > 0 &&
1569 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1570 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1571 !inet_csk_in_pingpong_mode(sk))) &&
1572 !atomic_read(&sk->sk_rmem_alloc)))
1573 time_to_ack = true;
1574 }
1575
1576
1577
1578
1579
1580
1581
1582 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1583 __u32 rcv_window_now = tcp_receive_window(tp);
1584
1585
1586 if (2*rcv_window_now <= tp->window_clamp) {
1587 __u32 new_window = __tcp_select_window(sk);
1588
1589
1590
1591
1592
1593
1594 if (new_window && new_window >= 2 * rcv_window_now)
1595 time_to_ack = true;
1596 }
1597 }
1598 if (time_to_ack)
1599 tcp_send_ack(sk);
1600}
1601
1602static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1603{
1604 struct sk_buff *skb;
1605 u32 offset;
1606
1607 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1608 offset = seq - TCP_SKB_CB(skb)->seq;
1609 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1610 pr_err_once("%s: found a SYN, please report !\n", __func__);
1611 offset--;
1612 }
1613 if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1614 *off = offset;
1615 return skb;
1616 }
1617
1618
1619
1620
1621 sk_eat_skb(sk, skb);
1622 }
1623 return NULL;
1624}
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
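/*
 *	This routine provides an alternative to tcp_recvmsg() for routines
 *	that would like to handle copying from skbuffs directly in 'sendfile'
 *	fashion.
 *	Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine.
 */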
1637int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1638 sk_read_actor_t recv_actor)
1639{
1640 struct sk_buff *skb;
1641 struct tcp_sock *tp = tcp_sk(sk);
1642 u32 seq = tp->copied_seq;
1643 u32 offset;
1644 int copied = 0;
1645
1646 if (sk->sk_state == TCP_LISTEN)
1647 return -ENOTCONN;
1648 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1649 if (offset < skb->len) {
1650 int used;
1651 size_t len;
1652
1653 len = skb->len - offset;
1654
1655 if (tp->urg_data) {
1656 u32 urg_offset = tp->urg_seq - seq;
1657 if (urg_offset < len)
1658 len = urg_offset;
1659 if (!len)
1660 break;
1661 }
1662 used = recv_actor(desc, skb, offset, len);
1663 if (used <= 0) {
1664 if (!copied)
1665 copied = used;
1666 break;
1667 } else if (used <= len) {
1668 seq += used;
1669 copied += used;
1670 offset += used;
1671 }
1672
1673
1674
1675
1676
1677 skb = tcp_recv_skb(sk, seq - 1, &offset);
1678 if (!skb)
1679 break;
1680
1681
1682
1683 if (offset + 1 != skb->len)
1684 continue;
1685 }
1686 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1687 sk_eat_skb(sk, skb);
1688 ++seq;
1689 break;
1690 }
1691 sk_eat_skb(sk, skb);
1692 if (!desc->count)
1693 break;
1694 WRITE_ONCE(tp->copied_seq, seq);
1695 }
1696 WRITE_ONCE(tp->copied_seq, seq);
1697
1698 tcp_rcv_space_adjust(sk);
1699
1700
1701 if (copied > 0) {
1702 tcp_recv_skb(sk, seq, &offset);
1703 tcp_cleanup_rbuf(sk, copied);
1704 }
1705 return copied;
1706}
1707EXPORT_SYMBOL(tcp_read_sock);
1708
1709int tcp_peek_len(struct socket *sock)
1710{
1711 return tcp_inq(sock->sk);
1712}
1713EXPORT_SYMBOL(tcp_peek_len);
1714
1715
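/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */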
1716int tcp_set_rcvlowat(struct sock *sk, int val)
1717{
1718 int cap;
1719
1720 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1721 cap = sk->sk_rcvbuf >> 1;
1722 else
1723 cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
1724 val = min(val, cap);
1725 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1726
1727
1728 tcp_data_ready(sk);
1729
1730 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1731 return 0;
1732
1733 val <<= 1;
1734 if (val > sk->sk_rcvbuf) {
1735 WRITE_ONCE(sk->sk_rcvbuf, val);
1736 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1737 }
1738 return 0;
1739}
1740EXPORT_SYMBOL(tcp_set_rcvlowat);
1741
1742#ifdef CONFIG_MMU
1743static const struct vm_operations_struct tcp_vm_ops = {
1744};
1745
1746int tcp_mmap(struct file *file, struct socket *sock,
1747 struct vm_area_struct *vma)
1748{
1749 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1750 return -EPERM;
1751 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1752
1753
1754 vma->vm_flags |= VM_MIXEDMAP;
1755
1756 vma->vm_ops = &tcp_vm_ops;
1757 return 0;
1758}
1759EXPORT_SYMBOL(tcp_mmap);
1760
1761static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
1762 u32 *offset_frag)
1763{
1764 skb_frag_t *frag;
1765
1766 offset_skb -= skb_headlen(skb);
1767 if ((int)offset_skb < 0 || skb_has_frag_list(skb))
1768 return NULL;
1769
1770 frag = skb_shinfo(skb)->frags;
1771 while (offset_skb) {
1772 if (skb_frag_size(frag) > offset_skb) {
1773 *offset_frag = offset_skb;
1774 return frag;
1775 }
1776 offset_skb -= skb_frag_size(frag);
1777 ++frag;
1778 }
1779 *offset_frag = 0;
1780 return frag;
1781}
1782
1783static bool can_map_frag(const skb_frag_t *frag)
1784{
1785 return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
1786}
1787
1788static int find_next_mappable_frag(const skb_frag_t *frag,
1789 int remaining_in_skb)
1790{
1791 int offset = 0;
1792
1793 if (likely(can_map_frag(frag)))
1794 return 0;
1795
1796 while (offset < remaining_in_skb && !can_map_frag(frag)) {
1797 offset += skb_frag_size(frag);
1798 ++frag;
1799 }
1800 return offset;
1801}
1802
1803static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
1804 struct tcp_zerocopy_receive *zc,
1805 struct sk_buff *skb, u32 offset)
1806{
1807 u32 frag_offset, partial_frag_remainder = 0;
1808 int mappable_offset;
1809 skb_frag_t *frag;
1810
1811
1812 zc->recv_skip_hint = skb->len - offset;
1813
1814
1815 frag = skb_advance_to_frag(skb, offset, &frag_offset);
1816 if (!frag)
1817 return;
1818
1819 if (frag_offset) {
1820 struct skb_shared_info *info = skb_shinfo(skb);
1821
1822
1823 if (frag == &info->frags[info->nr_frags - 1])
1824 return;
1825
1826
1827 partial_frag_remainder = skb_frag_size(frag) - frag_offset;
1828 zc->recv_skip_hint -= partial_frag_remainder;
1829 ++frag;
1830 }
1831
1832
1833
1834
1835
1836 mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
1837 zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
1838}
1839
1840static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
1841 int nonblock, int flags,
1842 struct scm_timestamping_internal *tss,
1843 int *cmsg_flags);
1844static int receive_fallback_to_copy(struct sock *sk,
1845 struct tcp_zerocopy_receive *zc, int inq)
1846{
1847 unsigned long copy_address = (unsigned long)zc->copybuf_address;
1848 struct scm_timestamping_internal tss_unused;
1849 int err, cmsg_flags_unused;
1850 struct msghdr msg = {};
1851 struct iovec iov;
1852
1853 zc->length = 0;
1854 zc->recv_skip_hint = 0;
1855
1856 if (copy_address != zc->copybuf_address)
1857 return -EINVAL;
1858
1859 err = import_single_range(READ, (void __user *)copy_address,
1860 inq, &iov, &msg.msg_iter);
1861 if (err)
1862 return err;
1863
1864 err = tcp_recvmsg_locked(sk, &msg, inq, 1, 0,
1865 &tss_unused, &cmsg_flags_unused);
1866 if (err < 0)
1867 return err;
1868
1869 zc->copybuf_len = err;
1870 if (likely(zc->copybuf_len)) {
1871 struct sk_buff *skb;
1872 u32 offset;
1873
1874 skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
1875 if (skb)
1876 tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
1877 }
1878 return 0;
1879}
1880
1881static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
1882 struct sk_buff *skb, u32 copylen,
1883 u32 *offset, u32 *seq)
1884{
1885 unsigned long copy_address = (unsigned long)zc->copybuf_address;
1886 struct msghdr msg = {};
1887 struct iovec iov;
1888 int err;
1889
1890 if (copy_address != zc->copybuf_address)
1891 return -EINVAL;
1892
1893 err = import_single_range(READ, (void __user *)copy_address,
1894 copylen, &iov, &msg.msg_iter);
1895 if (err)
1896 return err;
1897 err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
1898 if (err)
1899 return err;
1900 zc->recv_skip_hint -= copylen;
1901 *offset += copylen;
1902 *seq += copylen;
1903 return (__s32)copylen;
1904}
1905
1906static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
1907 struct sock *sk,
1908 struct sk_buff *skb,
1909 u32 *seq,
1910 s32 copybuf_len)
1911{
1912 u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
1913
1914 if (!copylen)
1915 return 0;
1916
1917 if (skb)
1918 offset = *seq - TCP_SKB_CB(skb)->seq;
1919 else
1920 skb = tcp_recv_skb(sk, *seq, &offset);
1921
1922 zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
1923 seq);
1924 return zc->copybuf_len < 0 ? 0 : copylen;
1925}
1926
1927static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
1928 struct page **pending_pages,
1929 unsigned long pages_remaining,
1930 unsigned long *address,
1931 u32 *length,
1932 u32 *seq,
1933 struct tcp_zerocopy_receive *zc,
1934 u32 total_bytes_to_map,
1935 int err)
1936{
1937
1938 if (err == -EBUSY &&
1939 zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
1940 u32 maybe_zap_len;
1941
1942 maybe_zap_len = total_bytes_to_map -
1943 *length +
1944 (pages_remaining * PAGE_SIZE);
1945 zap_page_range(vma, *address, maybe_zap_len);
1946 err = 0;
1947 }
1948
1949 if (!err) {
1950 unsigned long leftover_pages = pages_remaining;
1951 int bytes_mapped;
1952
1953
1954 err = vm_insert_pages(vma, *address,
1955 pending_pages,
1956 &pages_remaining);
1957 bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
1958 *seq += bytes_mapped;
1959 *address += bytes_mapped;
1960 }
1961 if (err) {
1962
1963
1964
1965
1966
1967 const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
1968
1969 *length -= bytes_not_mapped;
1970 zc->recv_skip_hint += bytes_not_mapped;
1971 }
1972 return err;
1973}
1974
1975static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
1976 struct page **pages,
1977 unsigned int pages_to_map,
1978 unsigned long *address,
1979 u32 *length,
1980 u32 *seq,
1981 struct tcp_zerocopy_receive *zc,
1982 u32 total_bytes_to_map)
1983{
1984 unsigned long pages_remaining = pages_to_map;
1985 unsigned int pages_mapped;
1986 unsigned int bytes_mapped;
1987 int err;
1988
1989 err = vm_insert_pages(vma, *address, pages, &pages_remaining);
1990 pages_mapped = pages_to_map - (unsigned int)pages_remaining;
1991 bytes_mapped = PAGE_SIZE * pages_mapped;
1992
1993
1994
1995 *seq += bytes_mapped;
1996 *address += bytes_mapped;
1997
1998 if (likely(!err))
1999 return 0;
2000
2001
2002 return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
2003 pages_remaining, address, length, seq, zc, total_bytes_to_map,
2004 err);
2005}
2006
2007#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
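/* TCP_ZEROCOPY_RECEIVE: map PAGE_SIZE-aligned receive-queue payload pages
 * directly into the caller's VMA in batches; whatever cannot be mapped is
 * reported back via recv_skip_hint or copied through the user copybuf.
 */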
2008static int tcp_zerocopy_receive(struct sock *sk,
2009 struct tcp_zerocopy_receive *zc)
2010{
2011 u32 length = 0, offset, vma_len, avail_len, copylen = 0;
2012 unsigned long address = (unsigned long)zc->address;
2013 struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
2014 s32 copybuf_len = zc->copybuf_len;
2015 struct tcp_sock *tp = tcp_sk(sk);
2016 const skb_frag_t *frags = NULL;
2017 unsigned int pages_to_map = 0;
2018 struct vm_area_struct *vma;
2019 struct sk_buff *skb = NULL;
2020 u32 seq = tp->copied_seq;
2021 u32 total_bytes_to_map;
2022 int inq = tcp_inq(sk);
2023 int ret;
2024
2025 zc->copybuf_len = 0;
2026
2027 if (address & (PAGE_SIZE - 1) || address != zc->address)
2028 return -EINVAL;
2029
2030 if (sk->sk_state == TCP_LISTEN)
2031 return -ENOTCONN;
2032
2033 sock_rps_record_flow(sk);
2034
2035 if (inq && inq <= copybuf_len)
2036 return receive_fallback_to_copy(sk, zc, inq);
2037
2038 if (inq < PAGE_SIZE) {
2039 zc->length = 0;
2040 zc->recv_skip_hint = inq;
2041 if (!inq && sock_flag(sk, SOCK_DONE))
2042 return -EIO;
2043 return 0;
2044 }
2045
2046 mmap_read_lock(current->mm);
2047
2048 vma = find_vma(current->mm, address);
2049 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
2050 mmap_read_unlock(current->mm);
2051 return -EINVAL;
2052 }
2053 vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
2054 avail_len = min_t(u32, vma_len, inq);
2055 total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
2056 if (total_bytes_to_map) {
2057 if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
2058 zap_page_range(vma, address, total_bytes_to_map);
2059 zc->length = total_bytes_to_map;
2060 zc->recv_skip_hint = 0;
2061 } else {
2062 zc->length = avail_len;
2063 zc->recv_skip_hint = avail_len;
2064 }
2065 ret = 0;
2066 while (length + PAGE_SIZE <= zc->length) {
2067 int mappable_offset;
2068 struct page *page;
2069
2070 if (zc->recv_skip_hint < PAGE_SIZE) {
2071 u32 offset_frag;
2072
2073 if (skb) {
2074 if (zc->recv_skip_hint > 0)
2075 break;
2076 skb = skb->next;
2077 offset = seq - TCP_SKB_CB(skb)->seq;
2078 } else {
2079 skb = tcp_recv_skb(sk, seq, &offset);
2080 }
2081 zc->recv_skip_hint = skb->len - offset;
2082 frags = skb_advance_to_frag(skb, offset, &offset_frag);
2083 if (!frags || offset_frag)
2084 break;
2085 }
2086
2087 mappable_offset = find_next_mappable_frag(frags,
2088 zc->recv_skip_hint);
2089 if (mappable_offset) {
2090 zc->recv_skip_hint = mappable_offset;
2091 break;
2092 }
2093 page = skb_frag_page(frags);
2094 prefetchw(page);
2095 pages[pages_to_map++] = page;
2096 length += PAGE_SIZE;
2097 zc->recv_skip_hint -= PAGE_SIZE;
2098 frags++;
2099 if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
2100 zc->recv_skip_hint < PAGE_SIZE) {
2101
2102
2103
2104 ret = tcp_zerocopy_vm_insert_batch(vma, pages,
2105 pages_to_map,
2106 &address, &length,
2107 &seq, zc,
2108 total_bytes_to_map);
2109 if (ret)
2110 goto out;
2111 pages_to_map = 0;
2112 }
2113 }
2114 if (pages_to_map) {
2115 ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
2116 &address, &length, &seq,
2117 zc, total_bytes_to_map);
2118 }
2119out:
2120 mmap_read_unlock(current->mm);
2121
2122 if (!ret)
2123 copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
2124 copybuf_len);
2125
2126 if (length + copylen) {
2127 WRITE_ONCE(tp->copied_seq, seq);
2128 tcp_rcv_space_adjust(sk);
2129
2130
2131 tcp_recv_skb(sk, seq, &offset);
2132 tcp_cleanup_rbuf(sk, length + copylen);
2133 ret = 0;
2134 if (length == zc->length)
2135 zc->recv_skip_hint = 0;
2136 } else {
2137 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
2138 ret = -EIO;
2139 }
2140 zc->length = length;
2141 return ret;
2142}
2143#endif
2144
2145static void tcp_update_recv_tstamps(struct sk_buff *skb,
2146 struct scm_timestamping_internal *tss)
2147{
2148 if (skb->tstamp)
2149 tss->ts[0] = ktime_to_timespec64(skb->tstamp);
2150 else
2151 tss->ts[0] = (struct timespec64) {0};
2152
2153 if (skb_hwtstamps(skb)->hwtstamp)
2154 tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
2155 else
2156 tss->ts[2] = (struct timespec64) {0};
2157}
2158
2159
2160static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
2161 struct scm_timestamping_internal *tss)
2162{
2163 int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
2164 bool has_timestamping = false;
2165
2166 if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
2167 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
2168 if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
2169 if (new_tstamp) {
2170 struct __kernel_timespec kts = {
2171 .tv_sec = tss->ts[0].tv_sec,
2172 .tv_nsec = tss->ts[0].tv_nsec,
2173 };
2174 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2175 sizeof(kts), &kts);
2176 } else {
2177 struct __kernel_old_timespec ts_old = {
2178 .tv_sec = tss->ts[0].tv_sec,
2179 .tv_nsec = tss->ts[0].tv_nsec,
2180 };
2181 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2182 sizeof(ts_old), &ts_old);
2183 }
2184 } else {
2185 if (new_tstamp) {
2186 struct __kernel_sock_timeval stv = {
2187 .tv_sec = tss->ts[0].tv_sec,
2188 .tv_usec = tss->ts[0].tv_nsec / 1000,
2189 };
2190 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2191 sizeof(stv), &stv);
2192 } else {
2193 struct __kernel_old_timeval tv = {
2194 .tv_sec = tss->ts[0].tv_sec,
2195 .tv_usec = tss->ts[0].tv_nsec / 1000,
2196 };
2197 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2198 sizeof(tv), &tv);
2199 }
2200 }
2201 }
2202
2203 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
2204 has_timestamping = true;
2205 else
2206 tss->ts[0] = (struct timespec64) {0};
2207 }
2208
2209 if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
2210 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
2211 has_timestamping = true;
2212 else
2213 tss->ts[2] = (struct timespec64) {0};
2214 }
2215
2216 if (has_timestamping) {
2217 tss->ts[1] = (struct timespec64) {0};
2218 if (sock_flag(sk, SOCK_TSTAMP_NEW))
2219 put_cmsg_scm_timestamping64(msg, tss);
2220 else
2221 put_cmsg_scm_timestamping(msg, tss);
2222 }
2223}
2224
2225static int tcp_inq_hint(struct sock *sk)
2226{
2227 const struct tcp_sock *tp = tcp_sk(sk);
2228 u32 copied_seq = READ_ONCE(tp->copied_seq);
2229 u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
2230 int inq;
2231
2232 inq = rcv_nxt - copied_seq;
2233 if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
2234 lock_sock(sk);
2235 inq = tp->rcv_nxt - tp->copied_seq;
2236 release_sock(sk);
2237 }
2238
2239
2240
2241 if (inq == 0 && sock_flag(sk, SOCK_DONE))
2242 inq = 1;
2243 return inq;
2244}
2245
2246
2247
2248
2249
2250
2251
2252
2253
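/*
 *	This routine copies from a sock struct into the user buffer.
 *	It runs on a locked socket, so tricks with *seq access order
 *	and skb->users are not required.
 */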
2254static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
2255 int nonblock, int flags,
2256 struct scm_timestamping_internal *tss,
2257 int *cmsg_flags)
2258{
2259 struct tcp_sock *tp = tcp_sk(sk);
2260 int copied = 0;
2261 u32 peek_seq;
2262 u32 *seq;
2263 unsigned long used;
2264 int err;
2265 int target;
2266 long timeo;
2267 struct sk_buff *skb, *last;
2268 u32 urg_hole = 0;
2269
2270 err = -ENOTCONN;
2271 if (sk->sk_state == TCP_LISTEN)
2272 goto out;
2273
2274 if (tp->recvmsg_inq)
2275 *cmsg_flags = 1;
2276 timeo = sock_rcvtimeo(sk, nonblock);
2277
2278
2279 if (flags & MSG_OOB)
2280 goto recv_urg;
2281
2282 if (unlikely(tp->repair)) {
2283 err = -EPERM;
2284 if (!(flags & MSG_PEEK))
2285 goto out;
2286
2287 if (tp->repair_queue == TCP_SEND_QUEUE)
2288 goto recv_sndq;
2289
2290 err = -EINVAL;
2291 if (tp->repair_queue == TCP_NO_QUEUE)
2292 goto out;
2293
2294
2295 }
2296
2297 seq = &tp->copied_seq;
2298 if (flags & MSG_PEEK) {
2299 peek_seq = tp->copied_seq;
2300 seq = &peek_seq;
2301 }
2302
2303 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
2304
2305 do {
2306 u32 offset;
2307
2308
2309 if (tp->urg_data && tp->urg_seq == *seq) {
2310 if (copied)
2311 break;
2312 if (signal_pending(current)) {
2313 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2314 break;
2315 }
2316 }
2317
2318
2319
2320 last = skb_peek_tail(&sk->sk_receive_queue);
2321 skb_queue_walk(&sk->sk_receive_queue, skb) {
2322 last = skb;
2323
2324
2325
2326 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2327 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2328 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2329 flags))
2330 break;
2331
2332 offset = *seq - TCP_SKB_CB(skb)->seq;
2333 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2334 pr_err_once("%s: found a SYN, please report !\n", __func__);
2335 offset--;
2336 }
2337 if (offset < skb->len)
2338 goto found_ok_skb;
2339 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2340 goto found_fin_ok;
2341 WARN(!(flags & MSG_PEEK),
2342 "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2343 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2344 }
2345
2346
2347
2348 if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
2349 break;
2350
2351 if (copied) {
2352 if (sk->sk_err ||
2353 sk->sk_state == TCP_CLOSE ||
2354 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2355 !timeo ||
2356 signal_pending(current))
2357 break;
2358 } else {
2359 if (sock_flag(sk, SOCK_DONE))
2360 break;
2361
2362 if (sk->sk_err) {
2363 copied = sock_error(sk);
2364 break;
2365 }
2366
2367 if (sk->sk_shutdown & RCV_SHUTDOWN)
2368 break;
2369
2370 if (sk->sk_state == TCP_CLOSE) {
2371
2372
2373
2374 copied = -ENOTCONN;
2375 break;
2376 }
2377
2378 if (!timeo) {
2379 copied = -EAGAIN;
2380 break;
2381 }
2382
2383 if (signal_pending(current)) {
2384 copied = sock_intr_errno(timeo);
2385 break;
2386 }
2387 }
2388
2389 tcp_cleanup_rbuf(sk, copied);
2390
2391 if (copied >= target) {
2392
2393 release_sock(sk);
2394 lock_sock(sk);
2395 } else {
2396 sk_wait_data(sk, &timeo, last);
2397 }
2398
2399 if ((flags & MSG_PEEK) &&
2400 (peek_seq - copied - urg_hole != tp->copied_seq)) {
2401 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2402 current->comm,
2403 task_pid_nr(current));
2404 peek_seq = tp->copied_seq;
2405 }
2406 continue;
2407
2408found_ok_skb:
2409
2410 used = skb->len - offset;
2411 if (len < used)
2412 used = len;
2413
2414
2415 if (tp->urg_data) {
2416 u32 urg_offset = tp->urg_seq - *seq;
2417 if (urg_offset < used) {
2418 if (!urg_offset) {
2419 if (!sock_flag(sk, SOCK_URGINLINE)) {
2420 WRITE_ONCE(*seq, *seq + 1);
2421 urg_hole++;
2422 offset++;
2423 used--;
2424 if (!used)
2425 goto skip_copy;
2426 }
2427 } else
2428 used = urg_offset;
2429 }
2430 }
2431
2432 if (!(flags & MSG_TRUNC)) {
2433 err = skb_copy_datagram_msg(skb, offset, msg, used);
2434 if (err) {
2435
2436 if (!copied)
2437 copied = -EFAULT;
2438 break;
2439 }
2440 }
2441
2442 WRITE_ONCE(*seq, *seq + used);
2443 copied += used;
2444 len -= used;
2445
2446 tcp_rcv_space_adjust(sk);
2447
2448skip_copy:
2449 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
2450 tp->urg_data = 0;
2451 tcp_fast_path_check(sk);
2452 }
2453
2454 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2455 tcp_update_recv_tstamps(skb, tss);
2456 *cmsg_flags |= 2;
2457 }
2458
2459 if (used + offset < skb->len)
2460 continue;
2461
2462 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2463 goto found_fin_ok;
2464 if (!(flags & MSG_PEEK))
2465 sk_eat_skb(sk, skb);
2466 continue;
2467
2468found_fin_ok:
2469
2470 WRITE_ONCE(*seq, *seq + 1);
2471 if (!(flags & MSG_PEEK))
2472 sk_eat_skb(sk, skb);
2473 break;
2474 } while (len > 0);
2475
2476
2477
2478
2479
2480
2481 tcp_cleanup_rbuf(sk, copied);
2482 return copied;
2483
2484out:
2485 return err;
2486
2487recv_urg:
2488 err = tcp_recv_urg(sk, msg, len, flags);
2489 goto out;
2490
2491recv_sndq:
2492 err = tcp_peek_sndq(sk, msg, len);
2493 goto out;
2494}
2495
2496int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
2497 int flags, int *addr_len)
2498{
2499 int cmsg_flags = 0, ret, inq;
2500 struct scm_timestamping_internal tss;
2501
2502 if (unlikely(flags & MSG_ERRQUEUE))
2503 return inet_recv_error(sk, msg, len, addr_len);
2504
2505 if (sk_can_busy_loop(sk) &&
2506 skb_queue_empty_lockless(&sk->sk_receive_queue) &&
2507 sk->sk_state == TCP_ESTABLISHED)
2508 sk_busy_loop(sk, nonblock);
2509
2510 lock_sock(sk);
2511 ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
2512 &cmsg_flags);
2513 release_sock(sk);
2514
2515 if (cmsg_flags && ret >= 0) {
2516 if (cmsg_flags & 2)
2517 tcp_recv_timestamp(msg, sk, &tss);
2518 if (cmsg_flags & 1) {
2519 inq = tcp_inq_hint(sk);
2520 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2521 }
2522 }
2523 return ret;
2524}
2525EXPORT_SYMBOL(tcp_recvmsg);
2526
2527void tcp_set_state(struct sock *sk, int state)
2528{
2529 int oldstate = sk->sk_state;
2530
2531
2532
2533
2534
2535
2536
2537
2538 BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2539 BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2540 BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2541 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2542 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2543 BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2544 BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2545 BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2546 BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2547 BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2548 BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2549 BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2550 BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2551
2552 if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2553 tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2554
2555 switch (state) {
2556 case TCP_ESTABLISHED:
2557 if (oldstate != TCP_ESTABLISHED)
2558 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2559 break;
2560
2561 case TCP_CLOSE:
2562 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2563 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2564
2565 sk->sk_prot->unhash(sk);
2566 if (inet_csk(sk)->icsk_bind_hash &&
2567 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2568 inet_put_port(sk);
2569 fallthrough;
2570 default:
2571 if (oldstate == TCP_ESTABLISHED)
2572 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2573 }
2574
2575
2576
2577
2578 inet_sk_state_store(sk, state);
2579}
2580EXPORT_SYMBOL_GPL(tcp_set_state);
2581
2582
2583
2584
2585
2586
2587
2588
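/* State transition table used on close()/shutdown(): indexed by the
 * current state, it gives the next state, with TCP_ACTION_FIN OR-ed in
 * when a FIN must be sent.
 */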
2589static const unsigned char new_state[16] = {
2590
2591 [0 ] = TCP_CLOSE,
2592 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2593 [TCP_SYN_SENT] = TCP_CLOSE,
2594 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2595 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2596 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2597 [TCP_TIME_WAIT] = TCP_CLOSE,
2598 [TCP_CLOSE] = TCP_CLOSE,
2599 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2600 [TCP_LAST_ACK] = TCP_LAST_ACK,
2601 [TCP_LISTEN] = TCP_CLOSE,
2602 [TCP_CLOSING] = TCP_CLOSING,
2603 [TCP_NEW_SYN_RECV] = TCP_CLOSE,
2604};
2605
2606static int tcp_close_state(struct sock *sk)
2607{
2608 int next = (int)new_state[sk->sk_state];
2609 int ns = next & TCP_STATE_MASK;
2610
2611 tcp_set_state(sk, ns);
2612
2613 return next & TCP_ACTION_FIN;
2614}
2615
2616
2617
2618
2619
2620
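/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */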
2621void tcp_shutdown(struct sock *sk, int how)
2622{
2623
2624
2625
2626
2627 if (!(how & SEND_SHUTDOWN))
2628 return;
2629
2630
2631 if ((1 << sk->sk_state) &
2632 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2633 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2634
2635 if (tcp_close_state(sk))
2636 tcp_send_fin(sk);
2637 }
2638}
2639EXPORT_SYMBOL(tcp_shutdown);
2640
2641bool tcp_check_oom(struct sock *sk, int shift)
2642{
2643 bool too_many_orphans, out_of_socket_memory;
2644
2645 too_many_orphans = tcp_too_many_orphans(sk, shift);
2646 out_of_socket_memory = tcp_out_of_memory(sk);
2647
2648 if (too_many_orphans)
2649 net_info_ratelimited("too many orphaned sockets\n");
2650 if (out_of_socket_memory)
2651 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2652 return too_many_orphans || out_of_socket_memory;
2653}
2654
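/* Core of close(): the caller must hold the socket lock.  tcp_close() wraps
 * this with lock_sock()/release_sock() and a final sock_put() that pairs
 * with the sock_hold() taken below when the socket is orphaned.
 */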
2655void __tcp_close(struct sock *sk, long timeout)
2656{
2657 struct sk_buff *skb;
2658 int data_was_unread = 0;
2659 int state;
2660
2661 sk->sk_shutdown = SHUTDOWN_MASK;
2662
2663 if (sk->sk_state == TCP_LISTEN) {
2664 tcp_set_state(sk, TCP_CLOSE);

		/* Special case: stop the listener and drop any queued
		 * connection requests.
		 */
2667 inet_csk_listen_stop(sk);
2668
2669 goto adjudge_to_death;
2670 }
2671
	/* Flush the receive queue, counting any unread payload (a FIN does
	 * not count as data).  The total decides below whether we send a
	 * RST or a FIN.
	 */
2676 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2677 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2678
2679 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2680 len--;
2681 data_was_unread += len;
2682 __kfree_skb(skb);
2683 }
2684
2685 sk_mem_reclaim(sk);

	/* Already in TCP_CLOSE (e.g. after an incoming RST): nothing to send. */
2688 if (sk->sk_state == TCP_CLOSE)
2689 goto adjudge_to_death;
2690
	/* Per RFC 2525, section 2.17, send a RST rather than a FIN when
	 * unread data is being discarded on close, so the peer learns that
	 * data was lost.  Sockets under repair are simply disconnected.
	 */
2698 if (unlikely(tcp_sk(sk)->repair)) {
2699 sk->sk_prot->disconnect(sk, 0);
2700 } else if (data_was_unread) {
2701
2702 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2703 tcp_set_state(sk, TCP_CLOSE);
2704 tcp_send_active_reset(sk, sk->sk_allocation);
2705 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2706
2707 sk->sk_prot->disconnect(sk, 0);
2708 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2709 } else if (tcp_close_state(sk)) {
		/* The application consumed all data, so close via the normal
		 * state machine: tcp_close_state() has already moved us to
		 * FIN_WAIT1 or LAST_ACK, now transmit the FIN.
		 */
2739 tcp_send_fin(sk);
2740 }
2741
2742 sk_stream_wait_close(sk, timeout);
2743
2744adjudge_to_death:
2745 state = sk->sk_state;
2746 sock_hold(sk);
2747 sock_orphan(sk);
2748
2749 local_bh_disable();
2750 bh_lock_sock(sk);
2751
2752 __release_sock(sk);
2753
2754 percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
2757 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2758 goto out;
2759
	/* A socket left in FIN_WAIT2 could stay around forever if the peer
	 * never sends another segment.  If linger2 is negative, reset the
	 * connection right away; otherwise arm the keepalive timer when the
	 * remaining FIN_WAIT2 time exceeds TCP_TIMEWAIT_LEN, or hand the
	 * socket over to a timewait bucket for the rest of it.
	 */
2774 if (sk->sk_state == TCP_FIN_WAIT2) {
2775 struct tcp_sock *tp = tcp_sk(sk);
2776 if (tp->linger2 < 0) {
2777 tcp_set_state(sk, TCP_CLOSE);
2778 tcp_send_active_reset(sk, GFP_ATOMIC);
2779 __NET_INC_STATS(sock_net(sk),
2780 LINUX_MIB_TCPABORTONLINGER);
2781 } else {
2782 const int tmo = tcp_fin_time(sk);
2783
2784 if (tmo > TCP_TIMEWAIT_LEN) {
2785 inet_csk_reset_keepalive_timer(sk,
2786 tmo - TCP_TIMEWAIT_LEN);
2787 } else {
2788 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2789 goto out;
2790 }
2791 }
2792 }
2793 if (sk->sk_state != TCP_CLOSE) {
2794 sk_mem_reclaim(sk);
2795 if (tcp_check_oom(sk, 0)) {
2796 tcp_set_state(sk, TCP_CLOSE);
2797 tcp_send_active_reset(sk, GFP_ATOMIC);
2798 __NET_INC_STATS(sock_net(sk),
2799 LINUX_MIB_TCPABORTONMEMORY);
2800 } else if (!check_net(sock_net(sk))) {
2801
2802 tcp_set_state(sk, TCP_CLOSE);
2803 }
2804 }
2805
2806 if (sk->sk_state == TCP_CLOSE) {
2807 struct request_sock *req;
2808
2809 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2810 lockdep_sock_is_held(sk));
		/* req may be non-NULL here if the socket is aborted (e.g.
		 * closed with unread data) before the fastopen three-way
		 * handshake completes.
		 */
2815 if (req)
2816 reqsk_fastopen_remove(sk, req, false);
2817 inet_csk_destroy_sock(sk);
2818 }
2819
2820
2821out:
2822 bh_unlock_sock(sk);
2823 local_bh_enable();
2824}
2825
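/* close() entry point: runs __tcp_close() under the socket lock and drops
 * the reference taken there once the lock is released.
 */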
2826void tcp_close(struct sock *sk, long timeout)
2827{
2828 lock_sock(sk);
2829 __tcp_close(sk, timeout);
2830 release_sock(sk);
2831 sock_put(sk);
2832}
2833EXPORT_SYMBOL(tcp_close);
2834
/* States in which an abort must be answered with a RST (RFC 793). */
2837static inline bool tcp_need_reset(int state)
2838{
2839 return (1 << state) &
2840 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2841 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2842}
2843
2844static void tcp_rtx_queue_purge(struct sock *sk)
2845{
2846 struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2847
2848 tcp_sk(sk)->highest_sack = NULL;
2849 while (p) {
2850 struct sk_buff *skb = rb_to_skb(p);
2851
2852 p = rb_next(p);

		/* The whole queue is being purged, so there is no need to
		 * clean up each skb's tcp_tsorted_anchor individually.
		 */
2856 tcp_rtx_queue_unlink(skb, sk);
2857 sk_wmem_free_skb(sk, skb);
2858 }
2859}
2860
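/* Throw away everything queued for transmission: the write queue, the
 * retransmit queue, the cached tx skb and all retransmission hints,
 * returning the accounted memory to the socket.
 */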
2861void tcp_write_queue_purge(struct sock *sk)
2862{
2863 struct sk_buff *skb;
2864
2865 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
2866 while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
2867 tcp_skb_tsorted_anchor_cleanup(skb);
2868 sk_wmem_free_skb(sk, skb);
2869 }
2870 tcp_rtx_queue_purge(sk);
2871 skb = sk->sk_tx_skb_cache;
2872 if (skb) {
2873 __kfree_skb(skb);
2874 sk->sk_tx_skb_cache = NULL;
2875 }
2876 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2877 sk_mem_reclaim(sk);
2878 tcp_clear_all_retrans_hints(tcp_sk(sk));
2879 tcp_sk(sk)->packets_out = 0;
2880 inet_csk(sk)->icsk_backoff = 0;
2881}
2882
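/* Implements disconnect (connect(AF_UNSPEC) and friends): abort the current
 * association if needed, purge every queue and reset the protocol state so
 * the socket can be reused for a fresh connection.
 */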
2883int tcp_disconnect(struct sock *sk, int flags)
2884{
2885 struct inet_sock *inet = inet_sk(sk);
2886 struct inet_connection_sock *icsk = inet_csk(sk);
2887 struct tcp_sock *tp = tcp_sk(sk);
2888 int old_state = sk->sk_state;
2889 u32 seq;
2890
2891 if (old_state != TCP_CLOSE)
2892 tcp_set_state(sk, TCP_CLOSE);
2893
2894
2895 if (old_state == TCP_LISTEN) {
2896 inet_csk_listen_stop(sk);
2897 } else if (unlikely(tp->repair)) {
2898 sk->sk_err = ECONNABORTED;
2899 } else if (tcp_need_reset(old_state) ||
2900 (tp->snd_nxt != tp->write_seq &&
2901 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2902
2903
2904
2905 tcp_send_active_reset(sk, gfp_any());
2906 sk->sk_err = ECONNRESET;
2907 } else if (old_state == TCP_SYN_SENT)
2908 sk->sk_err = ECONNRESET;
2909
2910 tcp_clear_xmit_timers(sk);
2911 __skb_queue_purge(&sk->sk_receive_queue);
2912 if (sk->sk_rx_skb_cache) {
2913 __kfree_skb(sk->sk_rx_skb_cache);
2914 sk->sk_rx_skb_cache = NULL;
2915 }
2916 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
2917 tp->urg_data = 0;
2918 tcp_write_queue_purge(sk);
2919 tcp_fastopen_active_disable_ofo_check(sk);
2920 skb_rbtree_purge(&tp->out_of_order_queue);
2921
2922 inet->inet_dport = 0;
2923
2924 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2925 inet_reset_saddr(sk);
2926
2927 sk->sk_shutdown = 0;
2928 sock_reset_flag(sk, SOCK_DONE);
2929 tp->srtt_us = 0;
2930 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
2931 tp->rcv_rtt_last_tsecr = 0;
2932
2933 seq = tp->write_seq + tp->max_window + 2;
2934 if (!seq)
2935 seq = 1;
2936 WRITE_ONCE(tp->write_seq, seq);
2937
2938 icsk->icsk_backoff = 0;
2939 icsk->icsk_probes_out = 0;
2940 icsk->icsk_probes_tstamp = 0;
2941 icsk->icsk_rto = TCP_TIMEOUT_INIT;
2942 icsk->icsk_rto_min = TCP_RTO_MIN;
2943 icsk->icsk_delack_max = TCP_DELACK_MAX;
2944 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2945 tp->snd_cwnd = TCP_INIT_CWND;
2946 tp->snd_cwnd_cnt = 0;
2947 tp->window_clamp = 0;
2948 tp->delivered = 0;
2949 tp->delivered_ce = 0;
2950 if (icsk->icsk_ca_ops->release)
2951 icsk->icsk_ca_ops->release(sk);
2952 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
2953 icsk->icsk_ca_initialized = 0;
2954 tcp_set_ca_state(sk, TCP_CA_Open);
2955 tp->is_sack_reneg = 0;
2956 tcp_clear_retrans(tp);
2957 tp->total_retrans = 0;
2958 inet_csk_delack_init(sk);

	/* Initialize rcv_mss to TCP_MIN_MSS to avoid a divide by zero in
	 * __tcp_select_window().
	 */
2962 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
2963 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2964 __sk_dst_reset(sk);
2965 dst_release(sk->sk_rx_dst);
2966 sk->sk_rx_dst = NULL;
2967 tcp_saved_syn_free(tp);
2968 tp->compressed_ack = 0;
2969 tp->segs_in = 0;
2970 tp->segs_out = 0;
2971 tp->bytes_sent = 0;
2972 tp->bytes_acked = 0;
2973 tp->bytes_received = 0;
2974 tp->bytes_retrans = 0;
2975 tp->data_segs_in = 0;
2976 tp->data_segs_out = 0;
2977 tp->duplicate_sack[0].start_seq = 0;
2978 tp->duplicate_sack[0].end_seq = 0;
2979 tp->dsack_dups = 0;
2980 tp->reord_seen = 0;
2981 tp->retrans_out = 0;
2982 tp->sacked_out = 0;
2983 tp->tlp_high_seq = 0;
2984 tp->last_oow_ack_time = 0;
2985
2986 tp->app_limited = ~0U;
2987 tp->rack.mstamp = 0;
2988 tp->rack.advanced = 0;
2989 tp->rack.reo_wnd_steps = 1;
2990 tp->rack.last_delivered = 0;
2991 tp->rack.reo_wnd_persist = 0;
2992 tp->rack.dsack_seen = 0;
2993 tp->syn_data_acked = 0;
2994 tp->rx_opt.saw_tstamp = 0;
2995 tp->rx_opt.dsack = 0;
2996 tp->rx_opt.num_sacks = 0;
2997 tp->rcv_ooopack = 0;

	/* Clean up fastopen-related state. */
3001 tcp_free_fastopen_req(tp);
3002 inet->defer_connect = 0;
3003 tp->fastopen_client_fail = 0;
3004
3005 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
3006
3007 if (sk->sk_frag.page) {
3008 put_page(sk->sk_frag.page);
3009 sk->sk_frag.page = NULL;
3010 sk->sk_frag.offset = 0;
3011 }
3012
3013 sk->sk_error_report(sk);
3014 return 0;
3015}
3016EXPORT_SYMBOL(tcp_disconnect);
3017
3018static inline bool tcp_can_repair_sock(const struct sock *sk)
3019{
3020 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
3021 (sk->sk_state != TCP_LISTEN);
3022}
3023
3024static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
3025{
3026 struct tcp_repair_window opt;
3027
3028 if (!tp->repair)
3029 return -EPERM;
3030
3031 if (len != sizeof(opt))
3032 return -EINVAL;
3033
3034 if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
3035 return -EFAULT;
3036
3037 if (opt.max_window < opt.snd_wnd)
3038 return -EINVAL;
3039
3040 if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
3041 return -EINVAL;
3042
3043 if (after(opt.rcv_wup, tp->rcv_nxt))
3044 return -EINVAL;
3045
3046 tp->snd_wl1 = opt.snd_wl1;
3047 tp->snd_wnd = opt.snd_wnd;
3048 tp->max_window = opt.max_window;
3049
3050 tp->rcv_wnd = opt.rcv_wnd;
3051 tp->rcv_wup = opt.rcv_wup;
3052
3053 return 0;
3054}
3055
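/* TCP_REPAIR_OPTIONS: replay a sequence of struct tcp_repair_opt entries
 * (MSS, window scaling, SACK permitted, timestamps) onto an established
 * socket that is being restored in repair mode.
 */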
3056static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
3057 unsigned int len)
3058{
3059 struct tcp_sock *tp = tcp_sk(sk);
3060 struct tcp_repair_opt opt;
3061 size_t offset = 0;
3062
3063 while (len >= sizeof(opt)) {
3064 if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
3065 return -EFAULT;
3066
3067 offset += sizeof(opt);
3068 len -= sizeof(opt);
3069
3070 switch (opt.opt_code) {
3071 case TCPOPT_MSS:
3072 tp->rx_opt.mss_clamp = opt.opt_val;
3073 tcp_mtup_init(sk);
3074 break;
3075 case TCPOPT_WINDOW:
3076 {
3077 u16 snd_wscale = opt.opt_val & 0xFFFF;
3078 u16 rcv_wscale = opt.opt_val >> 16;
3079
3080 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
3081 return -EFBIG;
3082
3083 tp->rx_opt.snd_wscale = snd_wscale;
3084 tp->rx_opt.rcv_wscale = rcv_wscale;
3085 tp->rx_opt.wscale_ok = 1;
3086 }
3087 break;
3088 case TCPOPT_SACK_PERM:
3089 if (opt.opt_val != 0)
3090 return -EINVAL;
3091
3092 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
3093 break;
3094 case TCPOPT_TIMESTAMP:
3095 if (opt.opt_val != 0)
3096 return -EINVAL;
3097
3098 tp->rx_opt.tstamp_ok = 1;
3099 break;
3100 }
3101 }
3102
3103 return 0;
3104}
3105
3106DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
3107EXPORT_SYMBOL(tcp_tx_delay_enabled);
3108
3109static void tcp_enable_tx_delay(void)
3110{
3111 if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
3112 static int __tcp_tx_delay_enabled = 0;
3113
3114 if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
3115 static_branch_enable(&tcp_tx_delay_enabled);
3116 pr_info("TCP_TX_DELAY enabled\n");
3117 }
3118 }
3119}
3120
/* TCP_CORK: while set, partial frames are held back and only full-sized
 * segments are sent.  When the application clears the option, any pending
 * partial frames are pushed out.  TCP_CORK may be combined with TCP_NODELAY
 * and is the stronger of the two.
 */
3130static void __tcp_sock_set_cork(struct sock *sk, bool on)
3131{
3132 struct tcp_sock *tp = tcp_sk(sk);
3133
3134 if (on) {
3135 tp->nonagle |= TCP_NAGLE_CORK;
3136 } else {
3137 tp->nonagle &= ~TCP_NAGLE_CORK;
3138 if (tp->nonagle & TCP_NAGLE_OFF)
3139 tp->nonagle |= TCP_NAGLE_PUSH;
3140 tcp_push_pending_frames(sk);
3141 }
3142}
3143
3144void tcp_sock_set_cork(struct sock *sk, bool on)
3145{
3146 lock_sock(sk);
3147 __tcp_sock_set_cork(sk, on);
3148 release_sock(sk);
3149}
3150EXPORT_SYMBOL(tcp_sock_set_cork);
3151
/* TCP_NODELAY is weaker than TCP_CORK: setting it on a corked socket is
 * remembered but has no effect until the cork is removed.  Setting it also
 * forces an explicit push of whatever is currently queued.
 */
3158static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
3159{
3160 if (on) {
3161 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
3162 tcp_push_pending_frames(sk);
3163 } else {
3164 tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
3165 }
3166}
3167
3168void tcp_sock_set_nodelay(struct sock *sk)
3169{
3170 lock_sock(sk);
3171 __tcp_sock_set_nodelay(sk, true);
3172 release_sock(sk);
3173}
3174EXPORT_SYMBOL(tcp_sock_set_nodelay);
3175
3176static void __tcp_sock_set_quickack(struct sock *sk, int val)
3177{
3178 if (!val) {
3179 inet_csk_enter_pingpong_mode(sk);
3180 return;
3181 }
3182
3183 inet_csk_exit_pingpong_mode(sk);
3184 if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3185 inet_csk_ack_scheduled(sk)) {
3186 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
3187 tcp_cleanup_rbuf(sk, 1);
3188 if (!(val & 1))
3189 inet_csk_enter_pingpong_mode(sk);
3190 }
3191}
3192
3193void tcp_sock_set_quickack(struct sock *sk, int val)
3194{
3195 lock_sock(sk);
3196 __tcp_sock_set_quickack(sk, val);
3197 release_sock(sk);
3198}
3199EXPORT_SYMBOL(tcp_sock_set_quickack);
3200
3201int tcp_sock_set_syncnt(struct sock *sk, int val)
3202{
3203 if (val < 1 || val > MAX_TCP_SYNCNT)
3204 return -EINVAL;
3205
3206 lock_sock(sk);
3207 inet_csk(sk)->icsk_syn_retries = val;
3208 release_sock(sk);
3209 return 0;
3210}
3211EXPORT_SYMBOL(tcp_sock_set_syncnt);
3212
3213void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
3214{
3215 lock_sock(sk);
3216 inet_csk(sk)->icsk_user_timeout = val;
3217 release_sock(sk);
3218}
3219EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3220
3221int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
3222{
3223 struct tcp_sock *tp = tcp_sk(sk);
3224
3225 if (val < 1 || val > MAX_TCP_KEEPIDLE)
3226 return -EINVAL;
3227
3228 tp->keepalive_time = val * HZ;
3229 if (sock_flag(sk, SOCK_KEEPOPEN) &&
3230 !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
3231 u32 elapsed = keepalive_time_elapsed(tp);
3232
3233 if (tp->keepalive_time > elapsed)
3234 elapsed = tp->keepalive_time - elapsed;
3235 else
3236 elapsed = 0;
3237 inet_csk_reset_keepalive_timer(sk, elapsed);
3238 }
3239
3240 return 0;
3241}
3242
3243int tcp_sock_set_keepidle(struct sock *sk, int val)
3244{
3245 int err;
3246
3247 lock_sock(sk);
3248 err = tcp_sock_set_keepidle_locked(sk, val);
3249 release_sock(sk);
3250 return err;
3251}
3252EXPORT_SYMBOL(tcp_sock_set_keepidle);
3253
3254int tcp_sock_set_keepintvl(struct sock *sk, int val)
3255{
3256 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3257 return -EINVAL;
3258
3259 lock_sock(sk);
3260 tcp_sk(sk)->keepalive_intvl = val * HZ;
3261 release_sock(sk);
3262 return 0;
3263}
3264EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3265
3266int tcp_sock_set_keepcnt(struct sock *sk, int val)
3267{
3268 if (val < 1 || val > MAX_TCP_KEEPCNT)
3269 return -EINVAL;
3270
3271 lock_sock(sk);
3272 tcp_sk(sk)->keepalive_probes = val;
3273 release_sock(sk);
3274 return 0;
3275}
3276EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3277
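/* TCP_WINDOW_CLAMP: bound the advertised receive window.  A value of zero
 * is only accepted on a closed socket (meaning "no clamp"); otherwise the
 * clamp is floored at SOCK_MIN_RCVBUF / 2.
 */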
3278int tcp_set_window_clamp(struct sock *sk, int val)
3279{
3280 struct tcp_sock *tp = tcp_sk(sk);
3281
3282 if (!val) {
3283 if (sk->sk_state != TCP_CLOSE)
3284 return -EINVAL;
3285 tp->window_clamp = 0;
3286 } else {
3287 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3288 SOCK_MIN_RCVBUF / 2 : val;
3289 }
3290 return 0;
3291}
3292
/*
 *	Socket option code for TCP.
 */
3296static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3297 sockptr_t optval, unsigned int optlen)
3298{
3299 struct tcp_sock *tp = tcp_sk(sk);
3300 struct inet_connection_sock *icsk = inet_csk(sk);
3301 struct net *net = sock_net(sk);
3302 int val;
3303 int err = 0;
3304
	/* These are data/string values; all the other options are ints. */
3306 switch (optname) {
3307 case TCP_CONGESTION: {
3308 char name[TCP_CA_NAME_MAX];
3309
3310 if (optlen < 1)
3311 return -EINVAL;
3312
3313 val = strncpy_from_sockptr(name, optval,
3314 min_t(long, TCP_CA_NAME_MAX-1, optlen));
3315 if (val < 0)
3316 return -EFAULT;
3317 name[val] = 0;
3318
3319 lock_sock(sk);
3320 err = tcp_set_congestion_control(sk, name, true,
3321 ns_capable(sock_net(sk)->user_ns,
3322 CAP_NET_ADMIN));
3323 release_sock(sk);
3324 return err;
3325 }
3326 case TCP_ULP: {
3327 char name[TCP_ULP_NAME_MAX];
3328
3329 if (optlen < 1)
3330 return -EINVAL;
3331
3332 val = strncpy_from_sockptr(name, optval,
3333 min_t(long, TCP_ULP_NAME_MAX - 1,
3334 optlen));
3335 if (val < 0)
3336 return -EFAULT;
3337 name[val] = 0;
3338
3339 lock_sock(sk);
3340 err = tcp_set_ulp(sk, name);
3341 release_sock(sk);
3342 return err;
3343 }
3344 case TCP_FASTOPEN_KEY: {
3345 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3346 __u8 *backup_key = NULL;
3347
		/* An optional backup key may be passed after the primary one
		 * to support key rotation; the first key stays the active one.
		 */
3351 if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3352 optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3353 return -EINVAL;
3354
3355 if (copy_from_sockptr(key, optval, optlen))
3356 return -EFAULT;
3357
3358 if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3359 backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3360
3361 return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
3362 }
3363 default:
3364
3365 break;
3366 }
3367
3368 if (optlen < sizeof(int))
3369 return -EINVAL;
3370
3371 if (copy_from_sockptr(&val, optval, sizeof(val)))
3372 return -EFAULT;
3373
3374 lock_sock(sk);
3375
3376 switch (optname) {
3377 case TCP_MAXSEG:
		/* Values larger than the interface MTU will not take effect,
		 * but the outgoing interface is typically not known yet at
		 * this point, so only sanity-check the range here.
		 */
3382 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
3383 err = -EINVAL;
3384 break;
3385 }
3386 tp->rx_opt.user_mss = val;
3387 break;
3388
3389 case TCP_NODELAY:
3390 __tcp_sock_set_nodelay(sk, val);
3391 break;
3392
3393 case TCP_THIN_LINEAR_TIMEOUTS:
3394 if (val < 0 || val > 1)
3395 err = -EINVAL;
3396 else
3397 tp->thin_lto = val;
3398 break;
3399
3400 case TCP_THIN_DUPACK:
3401 if (val < 0 || val > 1)
3402 err = -EINVAL;
3403 break;
3404
3405 case TCP_REPAIR:
3406 if (!tcp_can_repair_sock(sk))
3407 err = -EPERM;
3408 else if (val == TCP_REPAIR_ON) {
3409 tp->repair = 1;
3410 sk->sk_reuse = SK_FORCE_REUSE;
3411 tp->repair_queue = TCP_NO_QUEUE;
3412 } else if (val == TCP_REPAIR_OFF) {
3413 tp->repair = 0;
3414 sk->sk_reuse = SK_NO_REUSE;
3415 tcp_send_window_probe(sk);
3416 } else if (val == TCP_REPAIR_OFF_NO_WP) {
3417 tp->repair = 0;
3418 sk->sk_reuse = SK_NO_REUSE;
3419 } else
3420 err = -EINVAL;
3421
3422 break;
3423
3424 case TCP_REPAIR_QUEUE:
3425 if (!tp->repair)
3426 err = -EPERM;
3427 else if ((unsigned int)val < TCP_QUEUES_NR)
3428 tp->repair_queue = val;
3429 else
3430 err = -EINVAL;
3431 break;
3432
3433 case TCP_QUEUE_SEQ:
3434 if (sk->sk_state != TCP_CLOSE)
3435 err = -EPERM;
3436 else if (tp->repair_queue == TCP_SEND_QUEUE)
3437 WRITE_ONCE(tp->write_seq, val);
3438 else if (tp->repair_queue == TCP_RECV_QUEUE) {
3439 WRITE_ONCE(tp->rcv_nxt, val);
3440 WRITE_ONCE(tp->copied_seq, val);
3441 }
3442 else
3443 err = -EINVAL;
3444 break;
3445
3446 case TCP_REPAIR_OPTIONS:
3447 if (!tp->repair)
3448 err = -EINVAL;
3449 else if (sk->sk_state == TCP_ESTABLISHED)
3450 err = tcp_repair_options_est(sk, optval, optlen);
3451 else
3452 err = -EPERM;
3453 break;
3454
3455 case TCP_CORK:
3456 __tcp_sock_set_cork(sk, val);
3457 break;
3458
3459 case TCP_KEEPIDLE:
3460 err = tcp_sock_set_keepidle_locked(sk, val);
3461 break;
3462 case TCP_KEEPINTVL:
3463 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3464 err = -EINVAL;
3465 else
3466 tp->keepalive_intvl = val * HZ;
3467 break;
3468 case TCP_KEEPCNT:
3469 if (val < 1 || val > MAX_TCP_KEEPCNT)
3470 err = -EINVAL;
3471 else
3472 tp->keepalive_probes = val;
3473 break;
3474 case TCP_SYNCNT:
3475 if (val < 1 || val > MAX_TCP_SYNCNT)
3476 err = -EINVAL;
3477 else
3478 icsk->icsk_syn_retries = val;
3479 break;
3480
3481 case TCP_SAVE_SYN:
3482
3483 if (val < 0 || val > 2)
3484 err = -EINVAL;
3485 else
3486 tp->save_syn = val;
3487 break;
3488
3489 case TCP_LINGER2:
3490 if (val < 0)
3491 tp->linger2 = -1;
3492 else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3493 tp->linger2 = TCP_FIN_TIMEOUT_MAX;
3494 else
3495 tp->linger2 = val * HZ;
3496 break;
3497
3498 case TCP_DEFER_ACCEPT:
		/* Translate the value in seconds into a number of retransmits. */
3500 icsk->icsk_accept_queue.rskq_defer_accept =
3501 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3502 TCP_RTO_MAX / HZ);
3503 break;
3504
3505 case TCP_WINDOW_CLAMP:
3506 err = tcp_set_window_clamp(sk, val);
3507 break;
3508
3509 case TCP_QUICKACK:
3510 __tcp_sock_set_quickack(sk, val);
3511 break;
3512
3513#ifdef CONFIG_TCP_MD5SIG
3514 case TCP_MD5SIG:
3515 case TCP_MD5SIG_EXT:
3516 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3517 break;
3518#endif
3519 case TCP_USER_TIMEOUT:
		/* Cap, in milliseconds, on how long transmitted data may stay
		 * unacknowledged (or a window probe unanswered) before the
		 * connection is aborted with ETIMEDOUT.
		 */
3523 if (val < 0)
3524 err = -EINVAL;
3525 else
3526 icsk->icsk_user_timeout = val;
3527 break;
3528
3529 case TCP_FASTOPEN:
3530 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3531 TCPF_LISTEN))) {
3532 tcp_fastopen_init_key_once(net);
3533
3534 fastopen_queue_tune(sk, val);
3535 } else {
3536 err = -EINVAL;
3537 }
3538 break;
3539 case TCP_FASTOPEN_CONNECT:
3540 if (val > 1 || val < 0) {
3541 err = -EINVAL;
3542 } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3543 if (sk->sk_state == TCP_CLOSE)
3544 tp->fastopen_connect = val;
3545 else
3546 err = -EINVAL;
3547 } else {
3548 err = -EOPNOTSUPP;
3549 }
3550 break;
3551 case TCP_FASTOPEN_NO_COOKIE:
3552 if (val > 1 || val < 0)
3553 err = -EINVAL;
3554 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3555 err = -EINVAL;
3556 else
3557 tp->fastopen_no_cookie = val;
3558 break;
3559 case TCP_TIMESTAMP:
3560 if (!tp->repair)
3561 err = -EPERM;
3562 else
3563 tp->tsoffset = val - tcp_time_stamp_raw();
3564 break;
3565 case TCP_REPAIR_WINDOW:
3566 err = tcp_repair_set_window(tp, optval, optlen);
3567 break;
3568 case TCP_NOTSENT_LOWAT:
3569 tp->notsent_lowat = val;
3570 sk->sk_write_space(sk);
3571 break;
3572 case TCP_INQ:
3573 if (val > 1 || val < 0)
3574 err = -EINVAL;
3575 else
3576 tp->recvmsg_inq = val;
3577 break;
3578 case TCP_TX_DELAY:
3579 if (val)
3580 tcp_enable_tx_delay();
3581 tp->tcp_tx_delay = val;
3582 break;
3583 default:
3584 err = -ENOPROTOOPT;
3585 break;
3586 }
3587
3588 release_sock(sk);
3589 return err;
3590}
3591
3592int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
3593 unsigned int optlen)
3594{
3595 const struct inet_connection_sock *icsk = inet_csk(sk);
3596
3597 if (level != SOL_TCP)
3598 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3599 optval, optlen);
3600 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3601}
3602EXPORT_SYMBOL(tcp_setsockopt);
3603
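/* Convert the tcp_chrono busy/rwnd-limited/sndbuf-limited counters from
 * jiffies into the microsecond totals reported in struct tcp_info,
 * including the still-running period of the current chronograph.
 */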
3604static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3605 struct tcp_info *info)
3606{
3607 u64 stats[__TCP_CHRONO_MAX], total = 0;
3608 enum tcp_chrono i;
3609
3610 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3611 stats[i] = tp->chrono_stat[i - 1];
3612 if (i == tp->chrono_type)
3613 stats[i] += tcp_jiffies32 - tp->chrono_start;
3614 stats[i] *= USEC_PER_SEC / HZ;
3615 total += stats[i];
3616 }
3617
3618 info->tcpi_busy_time = total;
3619 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3620 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3621}
3622
/* Return information about the state of the TCP endpoint in API format. */
3624void tcp_get_info(struct sock *sk, struct tcp_info *info)
3625{
3626 const struct tcp_sock *tp = tcp_sk(sk);
3627 const struct inet_connection_sock *icsk = inet_csk(sk);
3628 unsigned long rate;
3629 u32 now;
3630 u64 rate64;
3631 bool slow;
3632
3633 memset(info, 0, sizeof(*info));
3634 if (sk->sk_type != SOCK_STREAM)
3635 return;
3636
3637 info->tcpi_state = inet_sk_state_load(sk);

	/* Report meaningful fields for all TCP states, including listeners. */
3640 rate = READ_ONCE(sk->sk_pacing_rate);
3641 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3642 info->tcpi_pacing_rate = rate64;
3643
3644 rate = READ_ONCE(sk->sk_max_pacing_rate);
3645 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3646 info->tcpi_max_pacing_rate = rate64;
3647
3648 info->tcpi_reordering = tp->reordering;
3649 info->tcpi_snd_cwnd = tp->snd_cwnd;
3650
3651 if (info->tcpi_state == TCP_LISTEN) {
		/* Listeners alias these fields:
		 *  tcpi_unacked -> number of children ready for accept()
		 *  tcpi_sacked  -> max backlog
		 */
3656 info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3657 info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
3658 return;
3659 }
3660
3661 slow = lock_sock_fast(sk);
3662
3663 info->tcpi_ca_state = icsk->icsk_ca_state;
3664 info->tcpi_retransmits = icsk->icsk_retransmits;
3665 info->tcpi_probes = icsk->icsk_probes_out;
3666 info->tcpi_backoff = icsk->icsk_backoff;
3667
3668 if (tp->rx_opt.tstamp_ok)
3669 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3670 if (tcp_is_sack(tp))
3671 info->tcpi_options |= TCPI_OPT_SACK;
3672 if (tp->rx_opt.wscale_ok) {
3673 info->tcpi_options |= TCPI_OPT_WSCALE;
3674 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3675 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3676 }
3677
3678 if (tp->ecn_flags & TCP_ECN_OK)
3679 info->tcpi_options |= TCPI_OPT_ECN;
3680 if (tp->ecn_flags & TCP_ECN_SEEN)
3681 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3682 if (tp->syn_data_acked)
3683 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3684
3685 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3686 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3687 info->tcpi_snd_mss = tp->mss_cache;
3688 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3689
3690 info->tcpi_unacked = tp->packets_out;
3691 info->tcpi_sacked = tp->sacked_out;
3692
3693 info->tcpi_lost = tp->lost_out;
3694 info->tcpi_retrans = tp->retrans_out;
3695
3696 now = tcp_jiffies32;
3697 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3698 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3699 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3700
3701 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3702 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3703 info->tcpi_rtt = tp->srtt_us >> 3;
3704 info->tcpi_rttvar = tp->mdev_us >> 2;
3705 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3706 info->tcpi_advmss = tp->advmss;
3707
3708 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3709 info->tcpi_rcv_space = tp->rcvq_space.space;
3710
3711 info->tcpi_total_retrans = tp->total_retrans;
3712
3713 info->tcpi_bytes_acked = tp->bytes_acked;
3714 info->tcpi_bytes_received = tp->bytes_received;
3715 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3716 tcp_get_info_chrono_stats(tp, info);
3717
3718 info->tcpi_segs_out = tp->segs_out;
3719 info->tcpi_segs_in = tp->segs_in;
3720
3721 info->tcpi_min_rtt = tcp_min_rtt(tp);
3722 info->tcpi_data_segs_in = tp->data_segs_in;
3723 info->tcpi_data_segs_out = tp->data_segs_out;
3724
3725 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3726 rate64 = tcp_compute_delivery_rate(tp);
3727 if (rate64)
3728 info->tcpi_delivery_rate = rate64;
3729 info->tcpi_delivered = tp->delivered;
3730 info->tcpi_delivered_ce = tp->delivered_ce;
3731 info->tcpi_bytes_sent = tp->bytes_sent;
3732 info->tcpi_bytes_retrans = tp->bytes_retrans;
3733 info->tcpi_dsack_dups = tp->dsack_dups;
3734 info->tcpi_reord_seen = tp->reord_seen;
3735 info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3736 info->tcpi_snd_wnd = tp->snd_wnd;
3737 info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
3738 unlock_sock_fast(sk, slow);
3739}
3740EXPORT_SYMBOL_GPL(tcp_get_info);
3741
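/* Upper bound, in bytes, of the netlink attribute payload emitted by
 * tcp_get_timestamping_opt_stats() below; the entries correspond, in order,
 * to the attributes that function emits and must be kept in sync with it.
 */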
3742static size_t tcp_opt_stats_get_size(void)
3743{
3744 return
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_BUSY */
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_RWND_LIMITED */
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_SNDBUF_LIMITED */
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_DATA_SEGS_OUT */
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_TOTAL_RETRANS */
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_PACING_RATE */
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_DELIVERY_RATE */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_SND_CWND */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_REORDERING */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_MIN_RTT */
		nla_total_size(sizeof(u8)) +		/* TCP_NLA_RECUR_RETRANS */
		nla_total_size(sizeof(u8)) +		/* TCP_NLA_DELIVERY_RATE_APP_LMT */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_SNDQ_SIZE */
		nla_total_size(sizeof(u8)) +		/* TCP_NLA_CA_STATE */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_SND_SSTHRESH */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_DELIVERED */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_DELIVERED_CE */
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_BYTES_SENT */
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_BYTES_RETRANS */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_DSACK_DUPS */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_REORD_SEEN */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_SRTT */
		nla_total_size(sizeof(u16)) +		/* TCP_NLA_TIMEOUT_REHASH */
		nla_total_size(sizeof(u32)) +		/* TCP_NLA_BYTES_NOTSENT */
		nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_EDT */
3770 0;
3771}
3772
3773struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3774 const struct sk_buff *orig_skb)
3775{
3776 const struct tcp_sock *tp = tcp_sk(sk);
3777 struct sk_buff *stats;
3778 struct tcp_info info;
3779 unsigned long rate;
3780 u64 rate64;
3781
3782 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3783 if (!stats)
3784 return NULL;
3785
3786 tcp_get_info_chrono_stats(tp, &info);
3787 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3788 info.tcpi_busy_time, TCP_NLA_PAD);
3789 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3790 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3791 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3792 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3793 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3794 tp->data_segs_out, TCP_NLA_PAD);
3795 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3796 tp->total_retrans, TCP_NLA_PAD);
3797
3798 rate = READ_ONCE(sk->sk_pacing_rate);
3799 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3800 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3801
3802 rate64 = tcp_compute_delivery_rate(tp);
3803 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3804
3805 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3806 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3807 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3808
3809 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3810 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3811 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3812 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3813 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3814
3815 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3816 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3817
3818 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3819 TCP_NLA_PAD);
3820 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3821 TCP_NLA_PAD);
3822 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3823 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3824 nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3825 nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
3826 nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
3827 max_t(int, 0, tp->write_seq - tp->snd_nxt));
3828 nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
3829 TCP_NLA_PAD);
3830
3831 return stats;
3832}
3833
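/* getsockopt(SOL_TCP) backend.  Typical userspace usage (illustrative only):
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len);
 */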
3834static int do_tcp_getsockopt(struct sock *sk, int level,
3835 int optname, char __user *optval, int __user *optlen)
3836{
3837 struct inet_connection_sock *icsk = inet_csk(sk);
3838 struct tcp_sock *tp = tcp_sk(sk);
3839 struct net *net = sock_net(sk);
3840 int val, len;
3841
3842 if (get_user(len, optlen))
3843 return -EFAULT;
3844
3845 len = min_t(unsigned int, len, sizeof(int));
3846
3847 if (len < 0)
3848 return -EINVAL;
3849
3850 switch (optname) {
3851 case TCP_MAXSEG:
3852 val = tp->mss_cache;
3853 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3854 val = tp->rx_opt.user_mss;
3855 if (tp->repair)
3856 val = tp->rx_opt.mss_clamp;
3857 break;
3858 case TCP_NODELAY:
3859 val = !!(tp->nonagle&TCP_NAGLE_OFF);
3860 break;
3861 case TCP_CORK:
3862 val = !!(tp->nonagle&TCP_NAGLE_CORK);
3863 break;
3864 case TCP_KEEPIDLE:
3865 val = keepalive_time_when(tp) / HZ;
3866 break;
3867 case TCP_KEEPINTVL:
3868 val = keepalive_intvl_when(tp) / HZ;
3869 break;
3870 case TCP_KEEPCNT:
3871 val = keepalive_probes(tp);
3872 break;
3873 case TCP_SYNCNT:
3874 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3875 break;
3876 case TCP_LINGER2:
3877 val = tp->linger2;
3878 if (val >= 0)
3879 val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3880 break;
3881 case TCP_DEFER_ACCEPT:
3882 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3883 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3884 break;
3885 case TCP_WINDOW_CLAMP:
3886 val = tp->window_clamp;
3887 break;
3888 case TCP_INFO: {
3889 struct tcp_info info;
3890
3891 if (get_user(len, optlen))
3892 return -EFAULT;
3893
3894 tcp_get_info(sk, &info);
3895
3896 len = min_t(unsigned int, len, sizeof(info));
3897 if (put_user(len, optlen))
3898 return -EFAULT;
3899 if (copy_to_user(optval, &info, len))
3900 return -EFAULT;
3901 return 0;
3902 }
3903 case TCP_CC_INFO: {
3904 const struct tcp_congestion_ops *ca_ops;
3905 union tcp_cc_info info;
3906 size_t sz = 0;
3907 int attr;
3908
3909 if (get_user(len, optlen))
3910 return -EFAULT;
3911
3912 ca_ops = icsk->icsk_ca_ops;
3913 if (ca_ops && ca_ops->get_info)
3914 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
3915
3916 len = min_t(unsigned int, len, sz);
3917 if (put_user(len, optlen))
3918 return -EFAULT;
3919 if (copy_to_user(optval, &info, len))
3920 return -EFAULT;
3921 return 0;
3922 }
3923 case TCP_QUICKACK:
3924 val = !inet_csk_in_pingpong_mode(sk);
3925 break;
3926
3927 case TCP_CONGESTION:
3928 if (get_user(len, optlen))
3929 return -EFAULT;
3930 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
3931 if (put_user(len, optlen))
3932 return -EFAULT;
3933 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
3934 return -EFAULT;
3935 return 0;
3936
3937 case TCP_ULP:
3938 if (get_user(len, optlen))
3939 return -EFAULT;
3940 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
3941 if (!icsk->icsk_ulp_ops) {
3942 if (put_user(0, optlen))
3943 return -EFAULT;
3944 return 0;
3945 }
3946 if (put_user(len, optlen))
3947 return -EFAULT;
3948 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
3949 return -EFAULT;
3950 return 0;
3951
3952 case TCP_FASTOPEN_KEY: {
3953 u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
3954 unsigned int key_len;
3955
3956 if (get_user(len, optlen))
3957 return -EFAULT;
3958
3959 key_len = tcp_fastopen_get_cipher(net, icsk, key) *
3960 TCP_FASTOPEN_KEY_LENGTH;
3961 len = min_t(unsigned int, len, key_len);
3962 if (put_user(len, optlen))
3963 return -EFAULT;
3964 if (copy_to_user(optval, key, len))
3965 return -EFAULT;
3966 return 0;
3967 }
3968 case TCP_THIN_LINEAR_TIMEOUTS:
3969 val = tp->thin_lto;
3970 break;
3971
3972 case TCP_THIN_DUPACK:
3973 val = 0;
3974 break;
3975
3976 case TCP_REPAIR:
3977 val = tp->repair;
3978 break;
3979
3980 case TCP_REPAIR_QUEUE:
3981 if (tp->repair)
3982 val = tp->repair_queue;
3983 else
3984 return -EINVAL;
3985 break;
3986
3987 case TCP_REPAIR_WINDOW: {
3988 struct tcp_repair_window opt;
3989
3990 if (get_user(len, optlen))
3991 return -EFAULT;
3992
3993 if (len != sizeof(opt))
3994 return -EINVAL;
3995
3996 if (!tp->repair)
3997 return -EPERM;
3998
3999 opt.snd_wl1 = tp->snd_wl1;
4000 opt.snd_wnd = tp->snd_wnd;
4001 opt.max_window = tp->max_window;
4002 opt.rcv_wnd = tp->rcv_wnd;
4003 opt.rcv_wup = tp->rcv_wup;
4004
4005 if (copy_to_user(optval, &opt, len))
4006 return -EFAULT;
4007 return 0;
4008 }
4009 case TCP_QUEUE_SEQ:
4010 if (tp->repair_queue == TCP_SEND_QUEUE)
4011 val = tp->write_seq;
4012 else if (tp->repair_queue == TCP_RECV_QUEUE)
4013 val = tp->rcv_nxt;
4014 else
4015 return -EINVAL;
4016 break;
4017
4018 case TCP_USER_TIMEOUT:
4019 val = icsk->icsk_user_timeout;
4020 break;
4021
4022 case TCP_FASTOPEN:
4023 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
4024 break;
4025
4026 case TCP_FASTOPEN_CONNECT:
4027 val = tp->fastopen_connect;
4028 break;
4029
4030 case TCP_FASTOPEN_NO_COOKIE:
4031 val = tp->fastopen_no_cookie;
4032 break;
4033
4034 case TCP_TX_DELAY:
4035 val = tp->tcp_tx_delay;
4036 break;
4037
4038 case TCP_TIMESTAMP:
4039 val = tcp_time_stamp_raw() + tp->tsoffset;
4040 break;
4041 case TCP_NOTSENT_LOWAT:
4042 val = tp->notsent_lowat;
4043 break;
4044 case TCP_INQ:
4045 val = tp->recvmsg_inq;
4046 break;
4047 case TCP_SAVE_SYN:
4048 val = tp->save_syn;
4049 break;
4050 case TCP_SAVED_SYN: {
4051 if (get_user(len, optlen))
4052 return -EFAULT;
4053
4054 lock_sock(sk);
4055 if (tp->saved_syn) {
4056 if (len < tcp_saved_syn_len(tp->saved_syn)) {
4057 if (put_user(tcp_saved_syn_len(tp->saved_syn),
4058 optlen)) {
4059 release_sock(sk);
4060 return -EFAULT;
4061 }
4062 release_sock(sk);
4063 return -EINVAL;
4064 }
4065 len = tcp_saved_syn_len(tp->saved_syn);
4066 if (put_user(len, optlen)) {
4067 release_sock(sk);
4068 return -EFAULT;
4069 }
4070 if (copy_to_user(optval, tp->saved_syn->data, len)) {
4071 release_sock(sk);
4072 return -EFAULT;
4073 }
4074 tcp_saved_syn_free(tp);
4075 release_sock(sk);
4076 } else {
4077 release_sock(sk);
4078 len = 0;
4079 if (put_user(len, optlen))
4080 return -EFAULT;
4081 }
4082 return 0;
4083 }
4084#ifdef CONFIG_MMU
4085 case TCP_ZEROCOPY_RECEIVE: {
4086 struct tcp_zerocopy_receive zc = {};
4087 int err;
4088
4089 if (get_user(len, optlen))
4090 return -EFAULT;
4091 if (len < offsetofend(struct tcp_zerocopy_receive, length))
4092 return -EINVAL;
4093 if (len > sizeof(zc)) {
4094 len = sizeof(zc);
4095 if (put_user(len, optlen))
4096 return -EFAULT;
4097 }
4098 if (copy_from_user(&zc, optval, len))
4099 return -EFAULT;
4100 lock_sock(sk);
4101 err = tcp_zerocopy_receive(sk, &zc);
4102 release_sock(sk);
4103 if (len >= offsetofend(struct tcp_zerocopy_receive, err))
4104 goto zerocopy_rcv_sk_err;
4105 switch (len) {
4106 case offsetofend(struct tcp_zerocopy_receive, err):
4107 goto zerocopy_rcv_sk_err;
4108 case offsetofend(struct tcp_zerocopy_receive, inq):
4109 goto zerocopy_rcv_inq;
4110 case offsetofend(struct tcp_zerocopy_receive, length):
4111 default:
4112 goto zerocopy_rcv_out;
4113 }
4114zerocopy_rcv_sk_err:
4115 if (!err)
4116 zc.err = sock_error(sk);
4117zerocopy_rcv_inq:
4118 zc.inq = tcp_inq_hint(sk);
4119zerocopy_rcv_out:
4120 if (!err && copy_to_user(optval, &zc, len))
4121 err = -EFAULT;
4122 return err;
4123 }
4124#endif
4125 default:
4126 return -ENOPROTOOPT;
4127 }
4128
4129 if (put_user(len, optlen))
4130 return -EFAULT;
4131 if (copy_to_user(optval, &val, len))
4132 return -EFAULT;
4133 return 0;
4134}
4135
4136int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
4137 int __user *optlen)
4138{
4139 struct inet_connection_sock *icsk = inet_csk(sk);
4140
4141 if (level != SOL_TCP)
4142 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
4143 optval, optlen);
4144 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
4145}
4146EXPORT_SYMBOL(tcp_getsockopt);
4147
4148#ifdef CONFIG_TCP_MD5SIG
4149static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
4150static DEFINE_MUTEX(tcp_md5sig_mutex);
4151static bool tcp_md5sig_pool_populated = false;
4152
4153static void __tcp_alloc_md5sig_pool(void)
4154{
4155 struct crypto_ahash *hash;
4156 int cpu;
4157
4158 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
4159 if (IS_ERR(hash))
4160 return;
4161
4162 for_each_possible_cpu(cpu) {
4163 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
4164 struct ahash_request *req;
4165
4166 if (!scratch) {
4167 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
4168 sizeof(struct tcphdr),
4169 GFP_KERNEL,
4170 cpu_to_node(cpu));
4171 if (!scratch)
4172 return;
4173 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
4174 }
4175 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
4176 continue;
4177
4178 req = ahash_request_alloc(hash, GFP_KERNEL);
4179 if (!req)
4180 return;
4181
4182 ahash_request_set_callback(req, 0, NULL, NULL);
4183
4184 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
4185 }

	/* All writes to the pool must be visible before the populated flag
	 * is set; pairs with the smp_rmb() in tcp_get_md5sig_pool().
	 */
4189 smp_wmb();
4190 tcp_md5sig_pool_populated = true;
4191}
4192
4193bool tcp_alloc_md5sig_pool(void)
4194{
4195 if (unlikely(!tcp_md5sig_pool_populated)) {
4196 mutex_lock(&tcp_md5sig_mutex);
4197
4198 if (!tcp_md5sig_pool_populated) {
4199 __tcp_alloc_md5sig_pool();
4200 if (tcp_md5sig_pool_populated)
4201 static_branch_inc(&tcp_md5_needed);
4202 }
4203
4204 mutex_unlock(&tcp_md5sig_mutex);
4205 }
4206 return tcp_md5sig_pool_populated;
4207}
4208EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
4209
/**
 * tcp_get_md5sig_pool - get the per-cpu md5sig pool for this user
 *
 * We use a percpu structure, so on success we return with BH disabled,
 * guaranteeing that another thread or softirq cannot race with us on the
 * same pool entry.  The caller re-enables BH when it is done with the pool.
 */
4218struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
4219{
4220 local_bh_disable();
4221
4222 if (tcp_md5sig_pool_populated) {
		/* Pairs with the smp_wmb() in __tcp_alloc_md5sig_pool(). */
4224 smp_rmb();
4225 return this_cpu_ptr(&tcp_md5sig_pool);
4226 }
4227 local_bh_enable();
4228 return NULL;
4229}
4230EXPORT_SYMBOL(tcp_get_md5sig_pool);
4231
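/* Feed the TCP payload of the skb (linear head beyond header_len, page
 * frags and any frag_list skbs) into the per-cpu MD5 hash request.
 * Returns non-zero if the crypto layer reports a failure.
 */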
4232int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
4233 const struct sk_buff *skb, unsigned int header_len)
4234{
4235 struct scatterlist sg;
4236 const struct tcphdr *tp = tcp_hdr(skb);
4237 struct ahash_request *req = hp->md5_req;
4238 unsigned int i;
4239 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
4240 skb_headlen(skb) - header_len : 0;
4241 const struct skb_shared_info *shi = skb_shinfo(skb);
4242 struct sk_buff *frag_iter;
4243
4244 sg_init_table(&sg, 1);
4245
4246 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
4247 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
4248 if (crypto_ahash_update(req))
4249 return 1;
4250
4251 for (i = 0; i < shi->nr_frags; ++i) {
4252 const skb_frag_t *f = &shi->frags[i];
4253 unsigned int offset = skb_frag_off(f);
4254 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
4255
4256 sg_set_page(&sg, page, skb_frag_size(f),
4257 offset_in_page(offset));
4258 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
4259 if (crypto_ahash_update(req))
4260 return 1;
4261 }
4262
4263 skb_walk_frags(skb, frag_iter)
4264 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
4265 return 1;
4266
4267 return 0;
4268}
4269EXPORT_SYMBOL(tcp_md5_hash_skb_data);
4270
4271int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
4272{
4273 u8 keylen = READ_ONCE(key->keylen);
4274 struct scatterlist sg;
4275
4276 sg_init_one(&sg, key->key, keylen);
4277 ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);

	/* key->key may be changed under us (e.g. by tcp_md5_do_add()),
	 * hence the data_race() annotation.
	 */
4280 return data_race(crypto_ahash_update(hp->md5_req));
4281}
4282EXPORT_SYMBOL(tcp_md5_hash_key);
4283
4284#endif
4285
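/* Final transition to TCP_CLOSE: stop timers, detach a pending fastopen
 * request, mark the socket shut down and either wake the owner or, for an
 * already-orphaned socket, destroy it immediately.
 */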
4286void tcp_done(struct sock *sk)
4287{
4288 struct request_sock *req;
4289
	/* tcp_done() may run on a freshly created socket for which lockdep
	 * has not seen the lock held, so pass "1" instead of
	 * lockdep_sock_is_held(sk).
	 */
4294 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
4295
4296 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
4297 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
4298
4299 tcp_set_state(sk, TCP_CLOSE);
4300 tcp_clear_xmit_timers(sk);
4301 if (req)
4302 reqsk_fastopen_remove(sk, req, false);
4303
4304 sk->sk_shutdown = SHUTDOWN_MASK;
4305
4306 if (!sock_flag(sk, SOCK_DEAD))
4307 sk->sk_state_change(sk);
4308 else
4309 inet_csk_destroy_sock(sk);
4310}
4311EXPORT_SYMBOL_GPL(tcp_done);
4312
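/* Forcefully abort a connection (used e.g. by the inet_diag SOCK_DESTROY
 * command): drop request sockets directly, stop listeners, report the error
 * to the owner and send a RST where the state requires one.
 */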
4313int tcp_abort(struct sock *sk, int err)
4314{
4315 if (!sk_fullsock(sk)) {
4316 if (sk->sk_state == TCP_NEW_SYN_RECV) {
4317 struct request_sock *req = inet_reqsk(sk);
4318
4319 local_bh_disable();
4320 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
4321 local_bh_enable();
4322 return 0;
4323 }
4324 return -EOPNOTSUPP;
4325 }
4326
	/* Don't race with userspace socket closes such as tcp_close(). */
4328 lock_sock(sk);
4329
4330 if (sk->sk_state == TCP_LISTEN) {
4331 tcp_set_state(sk, TCP_CLOSE);
4332 inet_csk_listen_stop(sk);
4333 }
4334
	/* Don't race with BH socket closes such as inet_csk_listen_stop(). */
4336 local_bh_disable();
4337 bh_lock_sock(sk);
4338
4339 if (!sock_flag(sk, SOCK_DEAD)) {
4340 sk->sk_err = err;
		/* Pairs with the smp_rmb() in tcp_poll(). */
4342 smp_wmb();
4343 sk->sk_error_report(sk);
4344 if (tcp_need_reset(sk->sk_state))
4345 tcp_send_active_reset(sk, GFP_ATOMIC);
4346 tcp_done(sk);
4347 }
4348
4349 bh_unlock_sock(sk);
4350 local_bh_enable();
4351 tcp_write_queue_purge(sk);
4352 release_sock(sk);
4353 return 0;
4354}
4355EXPORT_SYMBOL_GPL(tcp_abort);
4356
4357extern struct tcp_congestion_ops tcp_reno;
4358
4359static __initdata unsigned long thash_entries;
4360static int __init set_thash_entries(char *str)
4361{
4362 ssize_t ret;
4363
4364 if (!str)
4365 return 0;
4366
4367 ret = kstrtoul(str, 0, &thash_entries);
4368 if (ret)
4369 return 0;
4370
4371 return 1;
4372}
4373__setup("thash_entries=", set_thash_entries);
4374
4375static void __init tcp_init_mem(void)
4376{
4377 unsigned long limit = nr_free_buffer_pages() / 16;
4378
4379 limit = max(limit, 128UL);
4380 sysctl_tcp_mem[0] = limit / 4 * 3;
4381 sysctl_tcp_mem[1] = limit;
4382 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
4383}
4384
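/* Boot-time initialisation: allocate the established and bind hash tables,
 * set the global and per-netns memory limits, and register the built-in
 * reno congestion control before the first TCP socket can be created.
 */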
4385void __init tcp_init(void)
4386{
4387 int max_rshare, max_wshare, cnt;
4388 unsigned long limit;
4389 unsigned int i;
4390
4391 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
4392 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
4393 sizeof_field(struct sk_buff, cb));
4394
4395 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
4396 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
4397 inet_hashinfo_init(&tcp_hashinfo);
4398 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
4399 thash_entries, 21,
4400 0, 64 * 1024);
4401 tcp_hashinfo.bind_bucket_cachep =
4402 kmem_cache_create("tcp_bind_bucket",
4403 sizeof(struct inet_bind_bucket), 0,
4404 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
4405
	/* Size and allocate the main established and bind bucket hash
	 * tables.  The methodology mirrors that of the buffer cache.
	 */
4411 tcp_hashinfo.ehash =
4412 alloc_large_system_hash("TCP established",
4413 sizeof(struct inet_ehash_bucket),
4414 thash_entries,
4415 17,
4416 0,
4417 NULL,
4418 &tcp_hashinfo.ehash_mask,
4419 0,
4420 thash_entries ? 0 : 512 * 1024);
4421 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
4422 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4423
4424 if (inet_ehash_locks_alloc(&tcp_hashinfo))
4425 panic("TCP: failed to alloc ehash_locks");
4426 tcp_hashinfo.bhash =
4427 alloc_large_system_hash("TCP bind",
4428 sizeof(struct inet_bind_hashbucket),
4429 tcp_hashinfo.ehash_mask + 1,
4430 17,
4431 0,
4432 &tcp_hashinfo.bhash_size,
4433 NULL,
4434 0,
4435 64 * 1024);
4436 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4437 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4438 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4439 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4440 }
4441
4442
4443 cnt = tcp_hashinfo.ehash_mask + 1;
4444 sysctl_tcp_max_orphans = cnt / 2;
4445
4446 tcp_init_mem();

	/* Per-socket buffer limits: scale with available memory (bytes / 128),
	 * but no more than 4MB for send and 6MB for receive.
	 */
4448 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4449 max_wshare = min(4UL*1024*1024, limit);
4450 max_rshare = min(6UL*1024*1024, limit);
4451
4452 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
4453 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4454 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4455
4456 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
4457 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4458 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4459
4460 pr_info("Hash tables configured (established %u bind %u)\n",
4461 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4462
4463 tcp_v4_init();
4464 tcp_metrics_init();
4465 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4466 tcp_tasklet_init();
4467 mptcp_init();
4468}
4469