/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * TCP splice context
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (tcp_memory_pressure)
		return;
	val = jiffies;

	/* 0 means "not under pressure"; never publish it as a timestamp. */
	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!tcp_memory_pressure)
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
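
/*
 * Note on the pair above: tcp_memory_pressure doubles as a "pressure
 * entered at" jiffies timestamp.  cmpxchg() lets only the first CPU
 * racing into pressure record the time and bump TCPMEMORYPRESSURES;
 * xchg() on leave fetches that timestamp exactly once, so the CHRONO
 * counter accumulates jiffies_to_msecs(now - start) without any lock.
 */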

/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

/* Convert retransmits to seconds based on initial and max timeout */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}
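
/*
 * Worked example: with the initial RTO (1 second worth of jiffies) and
 * rto_max = TCP_RTO_MAX (120 seconds), the backoff series of timeouts
 * is 1, 2, 4, 8, ... so the cumulative periods are 1, 3, 7, 15, ...
 * secs_to_retrans(10, ...) returns 4 (since 7 < 10 <= 15), and
 * retrans_to_secs(4, ...) maps back to the 15 second period.
 */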

static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
	u32 rate = READ_ONCE(tp->rate_delivered);
	u32 intv = READ_ONCE(tp->rate_interval_us);
	u64 rate64 = 0;

	if (rate && intv) {
		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
		do_div(rate64, intv);
	}
	return rate64;
}
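
/*
 * Units: tp->rate_delivered is a packet count over tp->rate_interval_us
 * microseconds; scaling by mss_cache and USEC_PER_SEC yields bytes per
 * second.  E.g. 10 packets of 1448 bytes delivered in 10000 usec give
 * 10 * 1448 * 1000000 / 10000 = 1448000 B/s.
 */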

/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* Start with an initial congestion window of TCP_INIT_CWND
	 * segments; see RFC 6928 for the rationale.
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];

	sk_sockets_allocated_inc(sk);
	sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);

void tcp_init_transfer(struct sock *sk, int bpf_op)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_mtup_init(sk);
	icsk->icsk_af_ops->rebuild_header(sk);
	tcp_init_metrics(sk);
	tcp_call_bpf(sk, bpf_op, 0, NULL);
	tcp_init_congestion_control(sk);
	tcp_init_buffer_space(sk);
}

static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack = 1;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}
}

static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
					  int target, struct sock *sk)
{
	return (tp->rcv_nxt - tp->copied_seq >= target) ||
		(sk->sk_prot->stream_memory_read ?
		sk->sk_prot->stream_memory_read(sk) : false);
}

/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_poll_wait(file, sk_sleep(sk), wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */
	mask = 0;

	/*
	 * EPOLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that EPOLLHUP is incompatible
	 * with the EPOLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * flags than too few, and you are probably safe doing so.
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		if (tcp_stream_is_readable(tp, target, sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_is_writeable(sk)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {  /* send SIGIO later */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost. Memory barrier
				 * pairs with the input side.
				 */
				smp_mb__after_atomic();
				if (sk_stream_is_writeable(sk))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		/* Active TCP fastopen socket with defer_connect.
		 * Return EPOLLOUT so application can call write()
		 * in order for kernel to generate SYN+data.
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}

	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_nxt;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
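
/*
 * Example: with a peer-advertised max_window of 64KB, forced_push()
 * fires once more than 32KB have been queued past the last PSH-marked
 * byte, bounding how long data may linger behind MSG_MORE corking.
 */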

static void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	tcb->sacked = 0;
	__skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;

	tcp_slow_start_after_idle_check(sk);
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

/* If a not yet filled skb is pushed, do not send it if
 * we have data packets in Qdisc or NIC queues :
 * Because TX completion will happen shortly, it gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade off.
 * As packets containing data payload have a bigger truesize
 * than pure acks (dataless) packets, the last checks prevent
 * autocorking if we only have an ACK in Qdisc/NIC queues,
 * or if TX completion was delayed after we processed ACK packet.
 */
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
	       !tcp_rtx_queue_empty(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}

static void tcp_push(struct sock *sk, int flags, int mss_now,
		     int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}
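
/*
 * Autocork flow summary: when tcp_push() defers a small write, it sets
 * TSQ_THROTTLED so the TX completion path (tcp_wfree() -> TSQ tasklet)
 * flushes the queue once in-flight packets are freed.  The re-read of
 * sk_wmem_alloc closes the race where completion fired before the bit
 * was set; in that case tcp_push() falls through and flushes itself.
 */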

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
			      min(rd_desc->count, len), tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 **/
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);
	/*
	 * We can't seek on a socket input
	 */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/*
				 * This occurs when user tries to read
				 * from never connected socket.
				 */
				if (!sock_flag(sk, SOCK_DONE))
					ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			/* if __tcp_splice_read() got nothing while we have
			 * an skb in receive queue, we do not want to loop.
			 * This might happen with URG data.
			 */
			if (!skb_queue_empty(&sk->sk_receive_queue))
				break;
			sk_wait_data(sk, &timeo, NULL);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
				    bool force_schedule)
{
	struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned. */
	size = ALIGN(size, 4);

	if (unlikely(tcp_under_memory_pressure(sk)))
		sk_mem_reclaim_partial(sk);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (likely(skb)) {
		bool mem_scheduled;

		if (force_schedule) {
			mem_scheduled = true;
			sk_forced_mem_schedule(sk, skb->truesize);
		} else {
			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
		}
		if (likely(mem_scheduled)) {
			skb_reserve(skb, sk->sk_prot->max_header);
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb->reserved_tailroom = skb->end - skb->tail - size;
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 new_size_goal, size_goal;

	if (!large_allowed)
		return mss_now;

	/* Note : tcp_tso_autosize() will eventually split this later */
	new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);

	/* We try hard to avoid divides here */
	size_goal = tp->gso_segs * mss_now;
	if (unlikely(new_size_goal < size_goal ||
		     new_size_goal >= size_goal + mss_now)) {
		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
				     sk->sk_gso_max_segs);
		size_goal = tp->gso_segs * mss_now;
	}

	return max(size_goal, mss_now);
}
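
/*
 * Example: with mss_now = 1448 and sk_gso_max_size = 64KB, the raw goal
 * is 65536 - 1 - MAX_TCP_HEADER bytes, clamped to half the window and
 * rounded down to a whole number of MSS via tp->gso_segs (capped by
 * sk_gso_max_segs).  gso_segs is only recomputed when the goal drifts
 * by at least one full MSS, avoiding a divide on every sendmsg().
 */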

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
			 size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto out_err;
	}

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		int copy, i;
		bool can_coalesce;

		if (!skb || (copy = size_goal - skb->len) <= 0 ||
		    !tcp_skb_can_collapse_to(skb)) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
					tcp_rtx_and_write_queues_empty(sk));
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= sysctl_max_skb_frags) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}

		if (!(flags & MSG_NO_SHARED_FRAGS))
			skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		tcp_skb_pcount_set(skb, 0);

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		size -= copy;
		if (!size)
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		tcp_push(sk, flags & ~MSG_MORE, mss_now,
			 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied) {
		tcp_tx_timestamp(sk, sk->sk_tsflags);
		if (!(flags & MSG_SENDPAGE_NOTLAST))
			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
		     err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return sk_stream_error(sk, flags, err);
}
EXPORT_SYMBOL_GPL(do_tcp_sendpages);

int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
			size_t size, int flags)
{
	if (!(sk->sk_route_caps & NETIF_F_SG))
		return sock_no_sendpage_locked(sk, page, offset, size, flags);

	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */

	return do_tcp_sendpages(sk, page, offset, size, flags);
}
EXPORT_SYMBOL_GPL(tcp_sendpage_locked);

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendpage_locked(sk, page, offset, size, flags);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sendpage);

/* Do not bother using a page frag for very small frames.
 * But use this heuristic only for the first skb in write queue.
 *
 * Having no payload in skb->head allows better SACK shifting
 * in tcp_shift_skb_data(), reducing sack/rack overhead, because
 * write queue has less skbs.
 * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
 * This also speeds up tso_fragment(), since it won't fallback
 * to tcp_fragment().
 */
static int linear_payload_sz(bool first_skb)
{
	if (first_skb)
		return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
	return 0;
}

static int select_size(bool first_skb, bool zc)
{
	if (zc)
		return 0;
	return linear_payload_sz(first_skb);
}

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
				int *copied, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr *uaddr = msg->msg_name;
	int err, flags;

	if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
	     uaddr->sa_family == AF_UNSPEC))
		return -EOPNOTSUPP;
	if (tp->fastopen_req)
		return -EALREADY; /* Another Fast Open is in progress */

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(!tp->fastopen_req))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;
	tp->fastopen_req->size = size;

	if (inet->defer_connect) {
		err = tcp_connect(sk);
		/* Same failure procedure as in tcp_v4/6_connect */
		if (err) {
			tcp_set_state(sk, TCP_CLOSE);
			inet->inet_dport = 0;
			sk->sk_route_caps = 0;
		}
	}
	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, uaddr,
				    msg->msg_namelen, flags, 1);
	/* fastopen_req could already be freed in __inet_stream_connect
	 * if the connection times out or gets rst
	 */
	if (tp->fastopen_req) {
		*copied = tp->fastopen_req->copied;
		tcp_free_fastopen_req(tp);
		inet->defer_connect = 0;
	}
	return err;
}

int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct ubuf_info *uarg = NULL;
	struct sk_buff *skb;
	struct sockcm_cookie sockc;
	int flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0;
	bool process_backlog = false;
	bool zc = false;
	long timeo;

	flags = msg->msg_flags;

	if (flags & MSG_ZEROCOPY && size) {
		if (sk->sk_state != TCP_ESTABLISHED) {
			err = -EINVAL;
			goto out_err;
		}

		skb = tcp_write_queue_tail(sk);
		uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
		if (!uarg) {
			err = -ENOBUFS;
			goto out_err;
		}

		zc = sk->sk_route_caps & NETIF_F_SG;
		if (!zc)
			uarg->zerocopy = 0;
	}

	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
	    !tp->repair) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	sockc.tsflags = sk->sk_tsflags;
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err)) {
			err = -EINVAL;
			goto out_err;
		}
	}

	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	/* Ok commence sending. */
	copied = 0;

restart:
	mss_now = tcp_send_mss(sk, &size_goal, flags);

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	while (msg_data_left(msg)) {
		int copy = 0;

		skb = tcp_write_queue_tail(sk);
		if (skb)
			copy = size_goal - skb->len;

		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
			bool first_skb;
			int linear;

new_segment:
			/* Allocate new segment. If the interface is SG,
			 * allocate skb fitting to single page.
			 */
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			if (process_backlog && sk_flush_backlog(sk)) {
				process_backlog = false;
				goto restart;
			}
			first_skb = tcp_rtx_and_write_queues_empty(sk);
			linear = select_size(first_skb, zc);
			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
						  first_skb);
			if (!skb)
				goto wait_for_memory;

			process_backlog = true;
			skb->ip_summed = CHECKSUM_PARTIAL;

			skb_entail(sk, skb);
			copy = size_goal;

			/* All packets are restored as if they have
			 * already been sent. skb_mstamp isn't set to
			 * avoid wrong rtt estimation.
			 */
			if (tp->repair)
				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
		}

		/* Try to append data to the end of skb. */
		if (copy > msg_data_left(msg))
			copy = msg_data_left(msg);

		/* Where to copy to? */
		if (skb_availroom(skb) > 0 && !zc) {
			/* We have some space in skb head. Superb! */
			copy = min_t(int, copy, skb_availroom(skb));
			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
			if (err)
				goto do_fault;
		} else if (!zc) {
			bool merge = true;
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			if (!sk_page_frag_refill(sk, pfrag))
				goto wait_for_memory;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				if (i >= sysctl_max_skb_frags) {
					tcp_mark_push(tp, skb);
					goto new_segment;
				}
				merge = false;
			}

			copy = min_t(int, copy, pfrag->size - pfrag->offset);

			if (!sk_wmem_schedule(sk, copy))
				goto wait_for_memory;

			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
						       pfrag->page,
						       pfrag->offset,
						       copy);
			if (err)
				goto do_error;

			/* Update the skb. */
			if (merge) {
				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			} else {
				skb_fill_page_desc(skb, i, pfrag->page,
						   pfrag->offset, copy);
				page_ref_inc(pfrag->page);
			}
			pfrag->offset += copy;
		} else {
			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
			if (err == -EMSGSIZE || err == -EEXIST) {
				tcp_mark_push(tp, skb);
				goto new_segment;
			}
			if (err < 0)
				goto do_error;
			copy = err;
		}

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		tcp_skb_pcount_set(skb, 0);

		copied += copy;
		if (!msg_data_left(msg)) {
			if (unlikely(flags & MSG_EOR))
				TCP_SKB_CB(skb)->eor = 1;
			goto out;
		}

		if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		if (copied)
			tcp_push(sk, flags & ~MSG_MORE, mss_now,
				 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied) {
		tcp_tx_timestamp(sk, sockc.tsflags);
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
out_nopush:
	sock_zerocopy_put(uarg);
	return copied + copied_syn;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied + copied_syn)
		goto out;
out_err:
	sock_zerocopy_put_abort(uarg);
	err = sk_stream_error(sk, flags, err);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
		     err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);

int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendmsg_locked(sk, msg, size);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sendmsg);

/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = TCP_URG_READ;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_to_msg(msg, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and the
	 * available implementations agree in this case: this call should
	 * never block, independent of the blocking state of the socket.
	 */
	return -EAGAIN;
}

static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
	struct sk_buff *skb;
	int copied = 0, err = 0;

	/* XXX -- need to support SO_PEEK_OFF */

	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			return err;
		copied += skb->len;
	}

	skb_queue_walk(&sk->sk_write_queue, skb) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			break;

		copied += skb->len;
	}

	return err ?: copied;
}

/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
static void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		   /* Delayed ACKs frequently hit locked sockets during bulk
		    * receive. */
		if (icsk->icsk_ack.blocked ||
		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !icsk->icsk_ack.pingpong)) &&
		      !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	/* We send an ACK if we can now advertise a non-zero window
	 * which has been raised "significantly".
	 *
	 * Even if window raised up to infinity, do not send window open ACK
	 * in states, where we will not receive more. It is useless.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than
			 * current one.  "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			pr_err_once("%s: found a SYN, please report !\n", __func__);
			offset--;
		}
		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
			*off = offset;
			return skb;
		}
		/* This looks weird, but this can happen if TCP collapsing
		 * splitted a fat GRO packet, while we released socket lock
		 * in skb_splice_bits()
		 */
		sk_eat_skb(sk, skb);
	}
	return NULL;
}

/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data */
			if (tp->urg_data) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= 0) {
				if (!copied)
					copied = used;
				break;
			} else if (used <= len) {
				seq += used;
				copied += used;
				offset += used;
			}
			/* If recv_actor drops the lock (e.g. TCP splice
			 * receive path releases the socket lock) -
			 * sk_receive_queue could have been modified.
			 * Get it again and continue from there.
			 */
			skb = tcp_recv_skb(sk, seq - 1, &offset);
			if (!skb)
				break;
			/* TCP coalescing might have appended data to the skb.
			 * Try to splice more frags
			 */
			if (offset + 1 != skb->len)
				continue;
		}
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
			sk_eat_skb(sk, skb);
			++seq;
			break;
		}
		sk_eat_skb(sk, skb);
		if (!desc->count)
			break;
		tp->copied_seq = seq;
	}
	tp->copied_seq = seq;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied > 0) {
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, copied);
	}
	return copied;
}
EXPORT_SYMBOL(tcp_read_sock);

int tcp_peek_len(struct socket *sock)
{
	return tcp_inq(sock->sk);
}
EXPORT_SYMBOL(tcp_peek_len);

/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
int tcp_set_rcvlowat(struct sock *sk, int val)
{
	int cap;

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		cap = sk->sk_rcvbuf >> 1;
	else
		cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
	val = min(val, cap);
	sk->sk_rcvlowat = val ? : 1;

	/* Check if we need to signal EPOLLIN right now */
	tcp_data_ready(sk);

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		return 0;

	val <<= 1;
	if (val > sk->sk_rcvbuf) {
		sk->sk_rcvbuf = val;
		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
	}
	return 0;
}
EXPORT_SYMBOL(tcp_set_rcvlowat);
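
/*
 * Example: SO_RCVLOWAT of 128KB is first clamped to half the receive
 * buffer limit; then, unless the application locked the buffer size
 * with SO_RCVBUF, sk_rcvbuf is grown to twice the lowat value so the
 * advertised window can actually reach the wakeup threshold.
 */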

#ifdef CONFIG_MMU
static const struct vm_operations_struct tcp_vm_ops = {
};

int tcp_mmap(struct file *file, struct socket *sock,
	     struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
		return -EPERM;
	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);

	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
	vma->vm_flags |= VM_MIXEDMAP;

	vma->vm_ops = &tcp_vm_ops;
	return 0;
}
EXPORT_SYMBOL(tcp_mmap);
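
/*
 * Rough sketch of the intended userspace pairing with
 * tcp_zerocopy_receive() below (assumed flow, error handling omitted):
 *
 *	addr = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);
 *	struct tcp_zerocopy_receive zc = {
 *		.address = (__u64)(unsigned long)addr,
 *		.length	 = chunk,
 *	};
 *	socklen_t zc_len = sizeof(zc);
 *	getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
 *
 * On return, zc.length bytes of payload are mapped at addr and
 * zc.recv_skip_hint bytes should be drained with recvmsg() before
 * retrying the zerocopy path.
 */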

static int tcp_zerocopy_receive(struct sock *sk,
				struct tcp_zerocopy_receive *zc)
{
	unsigned long address = (unsigned long)zc->address;
	const skb_frag_t *frags = NULL;
	u32 length = 0, seq, offset;
	struct vm_area_struct *vma;
	struct sk_buff *skb = NULL;
	struct tcp_sock *tp;
	int ret;

	if (address & (PAGE_SIZE - 1) || address != zc->address)
		return -EINVAL;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;

	sock_rps_record_flow(sk);

	down_read(&current->mm->mmap_sem);

	ret = -EINVAL;
	vma = find_vma(current->mm, address);
	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
		goto out;
	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);

	tp = tcp_sk(sk);
	seq = tp->copied_seq;
	zc->length = min_t(u32, zc->length, tcp_inq(sk));
	zc->length &= ~(PAGE_SIZE - 1);

	zap_page_range(vma, address, zc->length);

	zc->recv_skip_hint = 0;
	ret = 0;
	while (length + PAGE_SIZE <= zc->length) {
		if (zc->recv_skip_hint < PAGE_SIZE) {
			if (skb) {
				skb = skb->next;
				offset = seq - TCP_SKB_CB(skb)->seq;
			} else {
				skb = tcp_recv_skb(sk, seq, &offset);
			}

			zc->recv_skip_hint = skb->len - offset;
			offset -= skb_headlen(skb);
			if ((int)offset < 0 || skb_has_frag_list(skb))
				break;
			frags = skb_shinfo(skb)->frags;
			while (offset) {
				if (frags->size > offset)
					goto out;
				offset -= frags->size;
				frags++;
			}
		}
		if (frags->size != PAGE_SIZE || frags->page_offset)
			break;
		ret = vm_insert_page(vma, address + length,
				     skb_frag_page(frags));
		if (ret)
			break;
		length += PAGE_SIZE;
		seq += PAGE_SIZE;
		zc->recv_skip_hint -= PAGE_SIZE;
		frags++;
	}
out:
	up_read(&current->mm->mmap_sem);
	if (length) {
		tp->copied_seq = seq;
		tcp_rcv_space_adjust(sk);

		/* Clean up data we have read: This will do ACK frames. */
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, length);
		ret = 0;
		if (length == zc->length)
			zc->recv_skip_hint = 0;
	} else {
		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
			ret = -EIO;
	}
	zc->length = length;
	return ret;
}
#endif

static void tcp_update_recv_tstamps(struct sk_buff *skb,
				    struct scm_timestamping *tss)
{
	if (skb->tstamp)
		tss->ts[0] = ktime_to_timespec(skb->tstamp);
	else
		tss->ts[0] = (struct timespec) {0};

	if (skb_hwtstamps(skb)->hwtstamp)
		tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
	else
		tss->ts[2] = (struct timespec) {0};
}

/* Similar to __sock_recv_timestamp, but does not require an skb */
static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
			       struct scm_timestamping *tss)
{
	struct timeval tv;
	bool has_timestamping = false;

	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
				put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
					 sizeof(tss->ts[0]), &tss->ts[0]);
			} else {
				tv.tv_sec = tss->ts[0].tv_sec;
				tv.tv_usec = tss->ts[0].tv_nsec / 1000;

				put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
					 sizeof(tv), &tv);
			}
		}

		if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
			has_timestamping = true;
		else
			tss->ts[0] = (struct timespec) {0};
	}

	if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
		if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
			has_timestamping = true;
		else
			tss->ts[2] = (struct timespec) {0};
	}

	if (has_timestamping) {
		tss->ts[1] = (struct timespec) {0};
		put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
			 sizeof(*tss), tss);
	}
}

static int tcp_inq_hint(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 copied_seq = READ_ONCE(tp->copied_seq);
	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
	int inq;

	inq = rcv_nxt - copied_seq;
	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
		lock_sock(sk);
		inq = tp->rcv_nxt - tp->copied_seq;
		release_sock(sk);
	}
	return inq;
}
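
/*
 * tcp_inq_hint() first tries a lockless snapshot: if rcv_nxt and
 * copied_seq were read consistently (inq >= 0 and copied_seq unchanged
 * on re-read), the hint is returned without the socket lock; only a
 * detected race falls back to the locked recomputation.
 */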

/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical note: in 2.3 we work on _locked_ socket, so that
 *	tricks with *seq access order and skb->users are not required.
 *	Probably, code can be easily improved even more.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err, inq;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct sk_buff *skb, *last;
	u32 urg_hole = 0;
	struct scm_timestamping tss;
	bool has_tss = false;
	bool has_cmsg;

	if (unlikely(flags & MSG_ERRQUEUE))
		return inet_recv_error(sk, msg, len, addr_len);

	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
	    (sk->sk_state == TCP_ESTABLISHED))
		sk_busy_loop(sk, nonblock);

	lock_sock(sk);

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	has_cmsg = tp->recvmsg_inq;
	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	if (unlikely(tp->repair)) {
		err = -EPERM;
		if (!(flags & MSG_PEEK))
			goto out;

		if (tp->repair_queue == TCP_SEND_QUEUE)
			goto recv_sndq;

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out;

		/* 'common' recv queue MSG_PEEK-ing */
	}

	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything
		 * or have SIGURG pending.
		 */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		last = skb_peek_tail(&sk->sk_receive_queue);
		skb_queue_walk(&sk->sk_receive_queue, skb) {
			last = skb;
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				pr_err_once("%s: found a SYN, please report !\n", __func__);
				offset--;
			}
			if (offset < skb->len)
				goto found_ok_skb;
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		tcp_cleanup_rbuf(sk, copied);

		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else {
			sk_wait_data(sk, &timeo, last);
		}

		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
					    current->comm,
					    task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

	found_ok_skb:
		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						++*seq;
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
			err = skb_copy_datagram_msg(skb, offset, msg, used);
			if (err) {
				/* Exception. Bailout! */
				if (!copied)
					copied = -EFAULT;
				break;
			}
		}

		*seq += used;
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			tp->urg_data = 0;
			tcp_fast_path_check(sk);
		}
		if (used + offset < skb->len)
			continue;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			tcp_update_recv_tstamps(skb, &tss);
			has_tss = true;
			has_cmsg = true;
		}
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		continue;

	found_fin_ok:
		/* Process the FIN. */
		++*seq;
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		break;
	} while (len > 0);

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket.
	 */

	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

	release_sock(sk);

	if (has_cmsg) {
		if (has_tss)
			tcp_recv_timestamp(msg, sk, &tss);
		if (tp->recvmsg_inq) {
			inq = tcp_inq_hint(sk);
			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
		}
	}

	return copied;

out:
	release_sock(sk);
	return err;

recv_urg:
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;

recv_sndq:
	err = tcp_peek_sndq(sk, msg, len);
	goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);

void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->sk_state;

	/* The BPF_TCP_* states in uapi/linux/bpf.h must match the TCP_*
	 * state constants in net/tcp_states.h; catch any divergence at
	 * build time.
	 */
	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);

	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;

	case TCP_CLOSE:
		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == TCP_ESTABLISHED)
			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	inet_sk_state_store(sk, state);

#ifdef STATE_TRACE
	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);

/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 */
static const unsigned char new_state[16] = {
  /* current state:	    new state:		action:	*/
  [0 /* (Invalid) */]	= TCP_CLOSE,
  [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_SYN_SENT]	= TCP_CLOSE,
  [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
  [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
  [TCP_TIME_WAIT]	= TCP_CLOSE,
  [TCP_CLOSE]		= TCP_CLOSE,
  [TCP_CLOSE_WAIT]	= TCP_LAST_ACK | TCP_ACTION_FIN,
  [TCP_LAST_ACK]	= TCP_LAST_ACK,
  [TCP_LISTEN]		= TCP_CLOSE,
  [TCP_CLOSING]		= TCP_CLOSING,
  [TCP_NEW_SYN_RECV]	= TCP_CLOSE,	/* should not happen ! */
};

static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}
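
/*
 * Example: tcp_close_state() on an ESTABLISHED socket moves it to
 * FIN_WAIT1 and returns TCP_ACTION_FIN (the caller must send a FIN);
 * on SYN_SENT it moves straight to CLOSE and returns 0, since there
 * is nothing to transmit.
 */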

/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
EXPORT_SYMBOL(tcp_shutdown);

bool tcp_check_oom(struct sock *sk, int shift)
{
	bool too_many_orphans, out_of_socket_memory;

	too_many_orphans = tcp_too_many_orphans(sk, shift);
	out_of_socket_memory = tcp_out_of_memory(sk);

	if (too_many_orphans)
		net_info_ratelimited("too many orphaned sockets\n");
	if (out_of_socket_memory)
		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
	return too_many_orphans || out_of_socket_memory;
}

void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;

		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			len--;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost, rather than always sending a FIN as older stacks
	 * did.  Note: timeout is always zero in such a case.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 *
		 * RED-PEN. Formally speaking, we have broken TCP state
		 * machine. State transitions:
		 *
		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1
		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
		 *
		 * are legal only when FIN has been sent (i.e. in window),
		 * rather than queued out of window. Purists blame.
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	/* It is the last release_sock in its life. It will remove backlog. */
	release_sock(sk);

	/* Now socket is owned by kernel and we acquire BH lock
	 * to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	/* This is a (useful) BSD violating of the RFC. There is a
	 * problem with TCP as specified in that the other end could keep
	 * a socket open forever with no application left this end.
	 * We use a 1 minute timeout (about the same as BSD) then kill
	 * our end.  This is controlled by the linger2 option.
	 */
	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_mem_reclaim(sk);
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		} else if (!check_net(sock_net(sk))) {
			/* Not possible to send reset; just close */
			tcp_set_state(sk, TCP_CLOSE);
		}
	}

	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
		/* We could get here with a non-NULL req if the socket is
		 * created only in the listen queue entry i.e. in the accept
		 * queue of the listener, when a TFO connection is aborted.
		 */
		if (req)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);

/* These states need RST on ABORT according to RFC793 */

static inline bool tcp_need_reset(int state)
{
	return (1 << state) &
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}

static void tcp_rtx_queue_purge(struct sock *sk)
{
	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		/* Since we are deleting whole queue, no need to
		 * list_del(&skb->tcp_tsorted_anchor)
		 */
		tcp_rtx_queue_unlink(skb, sk);
		sk_wmem_free_skb(sk, skb);
	}
}

void tcp_write_queue_purge(struct sock *sk)
{
	struct sk_buff *skb;

	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		tcp_skb_tsorted_anchor_cleanup(skb);
		sk_wmem_free_skb(sk, skb);
	}
	tcp_rtx_queue_purge(sk);
	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
	sk_mem_reclaim(sk);
	tcp_clear_all_retrans_hints(tcp_sk(sk));
	tcp_sk(sk)->packets_out = 0;
}

int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;
	int old_state = sk->sk_state;

	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (unlikely(tp->repair)) {
		sk->sk_err = ECONNABORTED;
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepancy of Linux wrt. RFC
		 * states
		 */
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->sk_err = ECONNRESET;

	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	tp->copied_seq = tp->rcv_nxt;
	tp->urg_data = 0;
	tcp_write_queue_purge(sk);
	tcp_fastopen_active_disable_ofo_check(sk);
	skb_rbtree_purge(&tp->out_of_order_queue);

	inet->inet_dport = 0;

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt_us = 0;
	tp->write_seq += tp->max_window + 2;
	if (tp->write_seq == 0)
		tp->write_seq = 1;
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_cnt = 0;
	tp->window_clamp = 0;
	tp->delivered_ce = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tp->is_sack_reneg = 0;
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
	 * issue in __tcp_select_window()
	 */
	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);
	dst_release(sk->sk_rx_dst);
	sk->sk_rx_dst = NULL;
	tcp_saved_syn_free(tp);
	tp->compressed_ack = 0;

	/* Clean up fastopen related fields */
	tcp_free_fastopen_req(tp);
	inet->defer_connect = 0;

	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
		sk->sk_frag.offset = 0;
	}

	sk->sk_error_report(sk);
	return err;
}
EXPORT_SYMBOL(tcp_disconnect);

static inline bool tcp_can_repair_sock(const struct sock *sk)
{
	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
		(sk->sk_state != TCP_LISTEN);
}

static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
{
	struct tcp_repair_window opt;

	if (!tp->repair)
		return -EPERM;

	if (len != sizeof(opt))
		return -EINVAL;

	if (copy_from_user(&opt, optbuf, sizeof(opt)))
		return -EFAULT;

	if (opt.max_window < opt.snd_wnd)
		return -EINVAL;

	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
		return -EINVAL;

	if (after(opt.rcv_wup, tp->rcv_nxt))
		return -EINVAL;

	tp->snd_wl1 = opt.snd_wl1;
	tp->snd_wnd = opt.snd_wnd;
	tp->max_window = opt.max_window;

	tp->rcv_wnd = opt.rcv_wnd;
	tp->rcv_wup = opt.rcv_wup;

	return 0;
}

static int tcp_repair_options_est(struct sock *sk,
		struct tcp_repair_opt __user *optbuf, unsigned int len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_repair_opt opt;

	while (len >= sizeof(opt)) {
		if (copy_from_user(&opt, optbuf, sizeof(opt)))
			return -EFAULT;

		optbuf++;
		len -= sizeof(opt);

		switch (opt.opt_code) {
		case TCPOPT_MSS:
			tp->rx_opt.mss_clamp = opt.opt_val;
			tcp_mtup_init(sk);
			break;
		case TCPOPT_WINDOW:
			{
				u16 snd_wscale = opt.opt_val & 0xFFFF;
				u16 rcv_wscale = opt.opt_val >> 16;

				if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
					return -EFBIG;

				tp->rx_opt.snd_wscale = snd_wscale;
				tp->rx_opt.rcv_wscale = rcv_wscale;
				tp->rx_opt.wscale_ok = 1;
			}
			break;
		case TCPOPT_SACK_PERM:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
			break;
		case TCPOPT_TIMESTAMP:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.tstamp_ok = 1;
			break;
		}
	}

	return 0;
}
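
/*
 * Layout note: for TCPOPT_WINDOW both scale factors travel in a single
 * 32-bit opt_val, send scale in the low 16 bits and receive scale in
 * the high 16 bits, each bounded by TCP_MAX_WSCALE (14):
 *
 *	opt.opt_val = snd_wscale | ((u32)rcv_wscale << 16);
 */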
2707
2708
2709
2710
2711static int do_tcp_setsockopt(struct sock *sk, int level,
2712 int optname, char __user *optval, unsigned int optlen)
2713{
2714 struct tcp_sock *tp = tcp_sk(sk);
2715 struct inet_connection_sock *icsk = inet_csk(sk);
2716 struct net *net = sock_net(sk);
2717 int val;
2718 int err = 0;
2719
2720
2721 switch (optname) {
2722 case TCP_CONGESTION: {
2723 char name[TCP_CA_NAME_MAX];
2724
2725 if (optlen < 1)
2726 return -EINVAL;
2727
2728 val = strncpy_from_user(name, optval,
2729 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2730 if (val < 0)
2731 return -EFAULT;
2732 name[val] = 0;
2733
2734 lock_sock(sk);
2735 err = tcp_set_congestion_control(sk, name, true, true);
2736 release_sock(sk);
2737 return err;
2738 }
2739 case TCP_ULP: {
2740 char name[TCP_ULP_NAME_MAX];
2741
2742 if (optlen < 1)
2743 return -EINVAL;
2744
2745 val = strncpy_from_user(name, optval,
2746 min_t(long, TCP_ULP_NAME_MAX - 1,
2747 optlen));
2748 if (val < 0)
2749 return -EFAULT;
2750 name[val] = 0;
2751
2752 lock_sock(sk);
2753 err = tcp_set_ulp(sk, name);
2754 release_sock(sk);
2755 return err;
2756 }
2757 case TCP_FASTOPEN_KEY: {
2758 __u8 key[TCP_FASTOPEN_KEY_LENGTH];
2759
2760 if (optlen != sizeof(key))
2761 return -EINVAL;
2762
2763 if (copy_from_user(key, optval, optlen))
2764 return -EFAULT;
2765
2766 return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
2767 }
2768 default:
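/* fallthru */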
2770 break;
2771 }
2772
2773 if (optlen < sizeof(int))
2774 return -EINVAL;
2775
2776 if (get_user(val, (int __user *)optval))
2777 return -EFAULT;
2778
2779 lock_sock(sk);
2780
2781 switch (optname) {
2782 case TCP_MAXSEG:
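/* Values greater than interface MTU won't take effect. However
 * at the point when this call is done we typically don't yet
 * know which interface is going to be used.
 */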
2787 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
2788 err = -EINVAL;
2789 break;
2790 }
2791 tp->rx_opt.user_mss = val;
2792 break;
2793
2794 case TCP_NODELAY:
2795 if (val) {
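/* TCP_NODELAY is weaker than TCP_CORK, so that
 * this option on a corked socket is remembered, but
 * it is not activated until the cork is cleared.
 *
 * However, when TCP_NODELAY is set we make
 * an explicit push, which overrides even TCP_CORK
 * for currently queued segments.
 */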
2804 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2805 tcp_push_pending_frames(sk);
2806 } else {
2807 tp->nonagle &= ~TCP_NAGLE_OFF;
2808 }
2809 break;
2810
2811 case TCP_THIN_LINEAR_TIMEOUTS:
2812 if (val < 0 || val > 1)
2813 err = -EINVAL;
2814 else
2815 tp->thin_lto = val;
2816 break;
2817
2818 case TCP_THIN_DUPACK:
2819 if (val < 0 || val > 1)
2820 err = -EINVAL;
2821 break;
2822
2823 case TCP_REPAIR:
2824 if (!tcp_can_repair_sock(sk))
2825 err = -EPERM;
2826 else if (val == TCP_REPAIR_ON) {
2827 tp->repair = 1;
2828 sk->sk_reuse = SK_FORCE_REUSE;
2829 tp->repair_queue = TCP_NO_QUEUE;
2830 } else if (val == TCP_REPAIR_OFF) {
2831 tp->repair = 0;
2832 sk->sk_reuse = SK_NO_REUSE;
2833 tcp_send_window_probe(sk);
2834 } else if (val == TCP_REPAIR_OFF_NO_WP) {
2835 tp->repair = 0;
2836 sk->sk_reuse = SK_NO_REUSE;
2837 } else
2838 err = -EINVAL;
2839
2840 break;
2841
2842 case TCP_REPAIR_QUEUE:
2843 if (!tp->repair)
2844 err = -EPERM;
2845 else if ((unsigned int)val < TCP_QUEUES_NR)
2846 tp->repair_queue = val;
2847 else
2848 err = -EINVAL;
2849 break;
2850
2851 case TCP_QUEUE_SEQ:
2852 if (sk->sk_state != TCP_CLOSE)
2853 err = -EPERM;
2854 else if (tp->repair_queue == TCP_SEND_QUEUE)
2855 tp->write_seq = val;
2856 else if (tp->repair_queue == TCP_RECV_QUEUE)
2857 tp->rcv_nxt = val;
2858 else
2859 err = -EINVAL;
2860 break;
2861
2862 case TCP_REPAIR_OPTIONS:
2863 if (!tp->repair)
2864 err = -EINVAL;
2865 else if (sk->sk_state == TCP_ESTABLISHED)
2866 err = tcp_repair_options_est(sk,
2867 (struct tcp_repair_opt __user *)optval,
2868 optlen);
2869 else
2870 err = -EPERM;
2871 break;
2872
2873 case TCP_CORK:
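/* When set, this option indicates to always queue non-full frames.
 * Later the user clears this option and we transmit any pending
 * partial frames in the queue. This is meant to be used alongside
 * sendfile() to get properly filled frames when the user (for example)
 * must write out headers with a write() call first and then use
 * sendfile to send out the data parts.
 *
 * TCP_CORK can be set together with TCP_NODELAY and it is
 * stronger than TCP_NODELAY.
 */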
2885 if (val) {
2886 tp->nonagle |= TCP_NAGLE_CORK;
2887 } else {
2888 tp->nonagle &= ~TCP_NAGLE_CORK;
2889 if (tp->nonagle&TCP_NAGLE_OFF)
2890 tp->nonagle |= TCP_NAGLE_PUSH;
2891 tcp_push_pending_frames(sk);
2892 }
2893 break;
2894
2895 case TCP_KEEPIDLE:
2896 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2897 err = -EINVAL;
2898 else {
2899 tp->keepalive_time = val * HZ;
2900 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2901 !((1 << sk->sk_state) &
2902 (TCPF_CLOSE | TCPF_LISTEN))) {
2903 u32 elapsed = keepalive_time_elapsed(tp);
2904 if (tp->keepalive_time > elapsed)
2905 elapsed = tp->keepalive_time - elapsed;
2906 else
2907 elapsed = 0;
2908 inet_csk_reset_keepalive_timer(sk, elapsed);
2909 }
2910 }
2911 break;
2912 case TCP_KEEPINTVL:
2913 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2914 err = -EINVAL;
2915 else
2916 tp->keepalive_intvl = val * HZ;
2917 break;
2918 case TCP_KEEPCNT:
2919 if (val < 1 || val > MAX_TCP_KEEPCNT)
2920 err = -EINVAL;
2921 else
2922 tp->keepalive_probes = val;
2923 break;
2924 case TCP_SYNCNT:
2925 if (val < 1 || val > MAX_TCP_SYNCNT)
2926 err = -EINVAL;
2927 else
2928 icsk->icsk_syn_retries = val;
2929 break;
2930
2931 case TCP_SAVE_SYN:
2932 if (val < 0 || val > 1)
2933 err = -EINVAL;
2934 else
2935 tp->save_syn = val;
2936 break;
2937
2938 case TCP_LINGER2:
2939 if (val < 0)
2940 tp->linger2 = -1;
2941 else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
2942 tp->linger2 = 0;
2943 else
2944 tp->linger2 = val * HZ;
2945 break;
2946
2947 case TCP_DEFER_ACCEPT:
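/* Translate value in seconds to number of retransmits */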
2949 icsk->icsk_accept_queue.rskq_defer_accept =
2950 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2951 TCP_RTO_MAX / HZ);
2952 break;
2953
2954 case TCP_WINDOW_CLAMP:
2955 if (!val) {
2956 if (sk->sk_state != TCP_CLOSE) {
2957 err = -EINVAL;
2958 break;
2959 }
2960 tp->window_clamp = 0;
2961 } else
2962 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2963 SOCK_MIN_RCVBUF / 2 : val;
2964 break;
2965
2966 case TCP_QUICKACK:
2967 if (!val) {
2968 icsk->icsk_ack.pingpong = 1;
2969 } else {
2970 icsk->icsk_ack.pingpong = 0;
2971 if ((1 << sk->sk_state) &
2972 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2973 inet_csk_ack_scheduled(sk)) {
2974 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2975 tcp_cleanup_rbuf(sk, 1);
2976 if (!(val & 1))
2977 icsk->icsk_ack.pingpong = 1;
2978 }
2979 }
2980 break;
2981
2982#ifdef CONFIG_TCP_MD5SIG
2983 case TCP_MD5SIG:
2984 case TCP_MD5SIG_EXT:
2985 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
2986 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
2987 else
2988 err = -EINVAL;
2989 break;
2990#endif
2991 case TCP_USER_TIMEOUT:
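/* Cap the max time in ms TCP will retry or probe the window
 * before giving up and aborting (ETIMEDOUT) a connection.
 */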
2995 if (val < 0)
2996 err = -EINVAL;
2997 else
2998 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2999 break;
3000
3001 case TCP_FASTOPEN:
3002 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3003 TCPF_LISTEN))) {
3004 tcp_fastopen_init_key_once(net);
3005
3006 fastopen_queue_tune(sk, val);
3007 } else {
3008 err = -EINVAL;
3009 }
3010 break;
3011 case TCP_FASTOPEN_CONNECT:
3012 if (val > 1 || val < 0) {
3013 err = -EINVAL;
3014 } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3015 if (sk->sk_state == TCP_CLOSE)
3016 tp->fastopen_connect = val;
3017 else
3018 err = -EINVAL;
3019 } else {
3020 err = -EOPNOTSUPP;
3021 }
3022 break;
3023 case TCP_FASTOPEN_NO_COOKIE:
3024 if (val > 1 || val < 0)
3025 err = -EINVAL;
3026 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3027 err = -EINVAL;
3028 else
3029 tp->fastopen_no_cookie = val;
3030 break;
3031 case TCP_TIMESTAMP:
3032 if (!tp->repair)
3033 err = -EPERM;
3034 else
3035 tp->tsoffset = val - tcp_time_stamp_raw();
3036 break;
3037 case TCP_REPAIR_WINDOW:
3038 err = tcp_repair_set_window(tp, optval, optlen);
3039 break;
3040 case TCP_NOTSENT_LOWAT:
3041 tp->notsent_lowat = val;
3042 sk->sk_write_space(sk);
3043 break;
3044 case TCP_INQ:
3045 if (val > 1 || val < 0)
3046 err = -EINVAL;
3047 else
3048 tp->recvmsg_inq = val;
3049 break;
3050 default:
3051 err = -ENOPROTOOPT;
3052 break;
3053 }
3054
3055 release_sock(sk);
3056 return err;
3057}
3058
3059int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
3060 unsigned int optlen)
3061{
3062 const struct inet_connection_sock *icsk = inet_csk(sk);
3063
3064 if (level != SOL_TCP)
3065 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3066 optval, optlen);
3067 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3068}
3069EXPORT_SYMBOL(tcp_setsockopt);
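/* Usage sketch (userspace, illustrative only -- not part of this file):
 * callers reach do_tcp_setsockopt() through the plain sockets API,
 * e.g. to disable Nagle's algorithm on a connected socket:
 *
 *	int one = 1;
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0)
 *		perror("setsockopt(TCP_NODELAY)");
 */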
3070
3071#ifdef CONFIG_COMPAT
3072int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
3073 char __user *optval, unsigned int optlen)
3074{
3075 if (level != SOL_TCP)
3076 return inet_csk_compat_setsockopt(sk, level, optname,
3077 optval, optlen);
3078 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3079}
3080EXPORT_SYMBOL(compat_tcp_setsockopt);
3081#endif
3082
3083static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3084 struct tcp_info *info)
3085{
3086 u64 stats[__TCP_CHRONO_MAX], total = 0;
3087 enum tcp_chrono i;
3088
3089 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3090 stats[i] = tp->chrono_stat[i - 1];
3091 if (i == tp->chrono_type)
3092 stats[i] += tcp_jiffies32 - tp->chrono_start;
3093 stats[i] *= USEC_PER_SEC / HZ;
3094 total += stats[i];
3095 }
3096
3097 info->tcpi_busy_time = total;
3098 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3099 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3100}
3101
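/* Return information about state of tcp endpoint in API format. */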
3103void tcp_get_info(struct sock *sk, struct tcp_info *info)
3104{
3105 const struct tcp_sock *tp = tcp_sk(sk);
3106 const struct inet_connection_sock *icsk = inet_csk(sk);
3107 u32 now;
3108 u64 rate64;
3109 bool slow;
3110 u32 rate;
3111
3112 memset(info, 0, sizeof(*info));
3113 if (sk->sk_type != SOCK_STREAM)
3114 return;
3115
3116 info->tcpi_state = inet_sk_state_load(sk);
3117
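/* Report meaningful fields for all TCP states, including listeners */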
3119 rate = READ_ONCE(sk->sk_pacing_rate);
3120 rate64 = rate != ~0U ? rate : ~0ULL;
3121 info->tcpi_pacing_rate = rate64;
3122
3123 rate = READ_ONCE(sk->sk_max_pacing_rate);
3124 rate64 = rate != ~0U ? rate : ~0ULL;
3125 info->tcpi_max_pacing_rate = rate64;
3126
3127 info->tcpi_reordering = tp->reordering;
3128 info->tcpi_snd_cwnd = tp->snd_cwnd;
3129
3130 if (info->tcpi_state == TCP_LISTEN) {
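/* Listeners alias these fields:
 * tcpi_unacked -> number of children ready for accept()
 * tcpi_sacked  -> max backlog
 */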
3135 info->tcpi_unacked = sk->sk_ack_backlog;
3136 info->tcpi_sacked = sk->sk_max_ack_backlog;
3137 return;
3138 }
3139
3140 slow = lock_sock_fast(sk);
3141
3142 info->tcpi_ca_state = icsk->icsk_ca_state;
3143 info->tcpi_retransmits = icsk->icsk_retransmits;
3144 info->tcpi_probes = icsk->icsk_probes_out;
3145 info->tcpi_backoff = icsk->icsk_backoff;
3146
3147 if (tp->rx_opt.tstamp_ok)
3148 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3149 if (tcp_is_sack(tp))
3150 info->tcpi_options |= TCPI_OPT_SACK;
3151 if (tp->rx_opt.wscale_ok) {
3152 info->tcpi_options |= TCPI_OPT_WSCALE;
3153 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3154 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3155 }
3156
3157 if (tp->ecn_flags & TCP_ECN_OK)
3158 info->tcpi_options |= TCPI_OPT_ECN;
3159 if (tp->ecn_flags & TCP_ECN_SEEN)
3160 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3161 if (tp->syn_data_acked)
3162 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3163
3164 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3165 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3166 info->tcpi_snd_mss = tp->mss_cache;
3167 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3168
3169 info->tcpi_unacked = tp->packets_out;
3170 info->tcpi_sacked = tp->sacked_out;
3171
3172 info->tcpi_lost = tp->lost_out;
3173 info->tcpi_retrans = tp->retrans_out;
3174
3175 now = tcp_jiffies32;
3176 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3177 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3178 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3179
3180 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3181 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3182 info->tcpi_rtt = tp->srtt_us >> 3;
3183 info->tcpi_rttvar = tp->mdev_us >> 2;
3184 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3185 info->tcpi_advmss = tp->advmss;
3186
3187 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3188 info->tcpi_rcv_space = tp->rcvq_space.space;
3189
3190 info->tcpi_total_retrans = tp->total_retrans;
3191
3192 info->tcpi_bytes_acked = tp->bytes_acked;
3193 info->tcpi_bytes_received = tp->bytes_received;
3194 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3195 tcp_get_info_chrono_stats(tp, info);
3196
3197 info->tcpi_segs_out = tp->segs_out;
3198 info->tcpi_segs_in = tp->segs_in;
3199
3200 info->tcpi_min_rtt = tcp_min_rtt(tp);
3201 info->tcpi_data_segs_in = tp->data_segs_in;
3202 info->tcpi_data_segs_out = tp->data_segs_out;
3203
3204 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3205 rate64 = tcp_compute_delivery_rate(tp);
3206 if (rate64)
3207 info->tcpi_delivery_rate = rate64;
3208 info->tcpi_delivered = tp->delivered;
3209 info->tcpi_delivered_ce = tp->delivered_ce;
3210 unlock_sock_fast(sk, slow);
3211}
3212EXPORT_SYMBOL_GPL(tcp_get_info);
3213
3214struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3215{
3216 const struct tcp_sock *tp = tcp_sk(sk);
3217 struct sk_buff *stats;
3218 struct tcp_info info;
3219 u64 rate64;
3220 u32 rate;
3221
3222 stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
3223 7 * nla_total_size(sizeof(u32)) +
3224 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
3225 if (!stats)
3226 return NULL;
3227
3228 tcp_get_info_chrono_stats(tp, &info);
3229 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3230 info.tcpi_busy_time, TCP_NLA_PAD);
3231 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3232 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3233 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3234 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3235 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3236 tp->data_segs_out, TCP_NLA_PAD);
3237 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3238 tp->total_retrans, TCP_NLA_PAD);
3239
3240 rate = READ_ONCE(sk->sk_pacing_rate);
3241 rate64 = rate != ~0U ? rate : ~0ULL;
3242 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3243
3244 rate64 = tcp_compute_delivery_rate(tp);
3245 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3246
3247 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3248 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3249 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3250
3251 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3252 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3253 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3254 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3255 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3256
3257 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3258 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3259
3260 return stats;
3261}
3262
3263static int do_tcp_getsockopt(struct sock *sk, int level,
3264 int optname, char __user *optval, int __user *optlen)
3265{
3266 struct inet_connection_sock *icsk = inet_csk(sk);
3267 struct tcp_sock *tp = tcp_sk(sk);
3268 struct net *net = sock_net(sk);
3269 int val, len;
3270
3271 if (get_user(len, optlen))
3272 return -EFAULT;
3273
3274 len = min_t(unsigned int, len, sizeof(int));
3275
3276 if (len < 0)
3277 return -EINVAL;
3278
3279 switch (optname) {
3280 case TCP_MAXSEG:
3281 val = tp->mss_cache;
3282 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3283 val = tp->rx_opt.user_mss;
3284 if (tp->repair)
3285 val = tp->rx_opt.mss_clamp;
3286 break;
3287 case TCP_NODELAY:
3288 val = !!(tp->nonagle&TCP_NAGLE_OFF);
3289 break;
3290 case TCP_CORK:
3291 val = !!(tp->nonagle&TCP_NAGLE_CORK);
3292 break;
3293 case TCP_KEEPIDLE:
3294 val = keepalive_time_when(tp) / HZ;
3295 break;
3296 case TCP_KEEPINTVL:
3297 val = keepalive_intvl_when(tp) / HZ;
3298 break;
3299 case TCP_KEEPCNT:
3300 val = keepalive_probes(tp);
3301 break;
3302 case TCP_SYNCNT:
3303 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3304 break;
3305 case TCP_LINGER2:
3306 val = tp->linger2;
3307 if (val >= 0)
3308 val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3309 break;
3310 case TCP_DEFER_ACCEPT:
3311 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3312 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3313 break;
3314 case TCP_WINDOW_CLAMP:
3315 val = tp->window_clamp;
3316 break;
3317 case TCP_INFO: {
3318 struct tcp_info info;
3319
3320 if (get_user(len, optlen))
3321 return -EFAULT;
3322
3323 tcp_get_info(sk, &info);
3324
3325 len = min_t(unsigned int, len, sizeof(info));
3326 if (put_user(len, optlen))
3327 return -EFAULT;
3328 if (copy_to_user(optval, &info, len))
3329 return -EFAULT;
3330 return 0;
3331 }
3332 case TCP_CC_INFO: {
3333 const struct tcp_congestion_ops *ca_ops;
3334 union tcp_cc_info info;
3335 size_t sz = 0;
3336 int attr;
3337
3338 if (get_user(len, optlen))
3339 return -EFAULT;
3340
3341 ca_ops = icsk->icsk_ca_ops;
3342 if (ca_ops && ca_ops->get_info)
3343 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
3344
3345 len = min_t(unsigned int, len, sz);
3346 if (put_user(len, optlen))
3347 return -EFAULT;
3348 if (copy_to_user(optval, &info, len))
3349 return -EFAULT;
3350 return 0;
3351 }
3352 case TCP_QUICKACK:
3353 val = !icsk->icsk_ack.pingpong;
3354 break;
3355
3356 case TCP_CONGESTION:
3357 if (get_user(len, optlen))
3358 return -EFAULT;
3359 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
3360 if (put_user(len, optlen))
3361 return -EFAULT;
3362 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
3363 return -EFAULT;
3364 return 0;
3365
3366 case TCP_ULP:
3367 if (get_user(len, optlen))
3368 return -EFAULT;
3369 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
3370 if (!icsk->icsk_ulp_ops) {
3371 if (put_user(0, optlen))
3372 return -EFAULT;
3373 return 0;
3374 }
3375 if (put_user(len, optlen))
3376 return -EFAULT;
3377 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
3378 return -EFAULT;
3379 return 0;
3380
3381 case TCP_FASTOPEN_KEY: {
3382 __u8 key[TCP_FASTOPEN_KEY_LENGTH];
3383 struct tcp_fastopen_context *ctx;
3384
3385 if (get_user(len, optlen))
3386 return -EFAULT;
3387
3388 rcu_read_lock();
3389 ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
3390 if (ctx)
3391 memcpy(key, ctx->key, sizeof(key));
3392 else
3393 len = 0;
3394 rcu_read_unlock();
3395
3396 len = min_t(unsigned int, len, sizeof(key));
3397 if (put_user(len, optlen))
3398 return -EFAULT;
3399 if (copy_to_user(optval, key, len))
3400 return -EFAULT;
3401 return 0;
3402 }
3403 case TCP_THIN_LINEAR_TIMEOUTS:
3404 val = tp->thin_lto;
3405 break;
3406
3407 case TCP_THIN_DUPACK:
3408 val = 0;
3409 break;
3410
3411 case TCP_REPAIR:
3412 val = tp->repair;
3413 break;
3414
3415 case TCP_REPAIR_QUEUE:
3416 if (tp->repair)
3417 val = tp->repair_queue;
3418 else
3419 return -EINVAL;
3420 break;
3421
3422 case TCP_REPAIR_WINDOW: {
3423 struct tcp_repair_window opt;
3424
3425 if (get_user(len, optlen))
3426 return -EFAULT;
3427
3428 if (len != sizeof(opt))
3429 return -EINVAL;
3430
3431 if (!tp->repair)
3432 return -EPERM;
3433
3434 opt.snd_wl1 = tp->snd_wl1;
3435 opt.snd_wnd = tp->snd_wnd;
3436 opt.max_window = tp->max_window;
3437 opt.rcv_wnd = tp->rcv_wnd;
3438 opt.rcv_wup = tp->rcv_wup;
3439
3440 if (copy_to_user(optval, &opt, len))
3441 return -EFAULT;
3442 return 0;
3443 }
3444 case TCP_QUEUE_SEQ:
3445 if (tp->repair_queue == TCP_SEND_QUEUE)
3446 val = tp->write_seq;
3447 else if (tp->repair_queue == TCP_RECV_QUEUE)
3448 val = tp->rcv_nxt;
3449 else
3450 return -EINVAL;
3451 break;
3452
3453 case TCP_USER_TIMEOUT:
3454 val = jiffies_to_msecs(icsk->icsk_user_timeout);
3455 break;
3456
3457 case TCP_FASTOPEN:
3458 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
3459 break;
3460
3461 case TCP_FASTOPEN_CONNECT:
3462 val = tp->fastopen_connect;
3463 break;
3464
3465 case TCP_FASTOPEN_NO_COOKIE:
3466 val = tp->fastopen_no_cookie;
3467 break;
3468
3469 case TCP_TIMESTAMP:
3470 val = tcp_time_stamp_raw() + tp->tsoffset;
3471 break;
3472 case TCP_NOTSENT_LOWAT:
3473 val = tp->notsent_lowat;
3474 break;
3475 case TCP_INQ:
3476 val = tp->recvmsg_inq;
3477 break;
3478 case TCP_SAVE_SYN:
3479 val = tp->save_syn;
3480 break;
3481 case TCP_SAVED_SYN: {
3482 if (get_user(len, optlen))
3483 return -EFAULT;
3484
3485 lock_sock(sk);
3486 if (tp->saved_syn) {
3487 if (len < tp->saved_syn[0]) {
3488 if (put_user(tp->saved_syn[0], optlen)) {
3489 release_sock(sk);
3490 return -EFAULT;
3491 }
3492 release_sock(sk);
3493 return -EINVAL;
3494 }
3495 len = tp->saved_syn[0];
3496 if (put_user(len, optlen)) {
3497 release_sock(sk);
3498 return -EFAULT;
3499 }
3500 if (copy_to_user(optval, tp->saved_syn + 1, len)) {
3501 release_sock(sk);
3502 return -EFAULT;
3503 }
3504 tcp_saved_syn_free(tp);
3505 release_sock(sk);
3506 } else {
3507 release_sock(sk);
3508 len = 0;
3509 if (put_user(len, optlen))
3510 return -EFAULT;
3511 }
3512 return 0;
3513 }
3514#ifdef CONFIG_MMU
3515 case TCP_ZEROCOPY_RECEIVE: {
3516 struct tcp_zerocopy_receive zc;
3517 int err;
3518
3519 if (get_user(len, optlen))
3520 return -EFAULT;
3521 if (len != sizeof(zc))
3522 return -EINVAL;
3523 if (copy_from_user(&zc, optval, len))
3524 return -EFAULT;
3525 lock_sock(sk);
3526 err = tcp_zerocopy_receive(sk, &zc);
3527 release_sock(sk);
3528 if (!err && copy_to_user(optval, &zc, len))
3529 err = -EFAULT;
3530 return err;
3531 }
3532#endif
3533 default:
3534 return -ENOPROTOOPT;
3535 }
3536
3537 if (put_user(len, optlen))
3538 return -EFAULT;
3539 if (copy_to_user(optval, &val, len))
3540 return -EFAULT;
3541 return 0;
3542}
3543
3544int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
3545 int __user *optlen)
3546{
3547 struct inet_connection_sock *icsk = inet_csk(sk);
3548
3549 if (level != SOL_TCP)
3550 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
3551 optval, optlen);
3552 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3553}
3554EXPORT_SYMBOL(tcp_getsockopt);
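/* Usage sketch (userspace, illustrative only): the most common consumer
 * of do_tcp_getsockopt() is a TCP_INFO query, e.g.:
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt: %u us\n", ti.tcpi_rtt);
 */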
3555
3556#ifdef CONFIG_COMPAT
3557int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
3558 char __user *optval, int __user *optlen)
3559{
3560 if (level != SOL_TCP)
3561 return inet_csk_compat_getsockopt(sk, level, optname,
3562 optval, optlen);
3563 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3564}
3565EXPORT_SYMBOL(compat_tcp_getsockopt);
3566#endif
3567
3568#ifdef CONFIG_TCP_MD5SIG
3569static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
3570static DEFINE_MUTEX(tcp_md5sig_mutex);
3571static bool tcp_md5sig_pool_populated = false;
3572
3573static void __tcp_alloc_md5sig_pool(void)
3574{
3575 struct crypto_ahash *hash;
3576 int cpu;
3577
3578 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
3579 if (IS_ERR(hash))
3580 return;
3581
3582 for_each_possible_cpu(cpu) {
3583 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
3584 struct ahash_request *req;
3585
3586 if (!scratch) {
3587 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
3588 sizeof(struct tcphdr),
3589 GFP_KERNEL,
3590 cpu_to_node(cpu));
3591 if (!scratch)
3592 return;
3593 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
3594 }
3595 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
3596 continue;
3597
3598 req = ahash_request_alloc(hash, GFP_KERNEL);
3599 if (!req)
3600 return;
3601
3602 ahash_request_set_callback(req, 0, NULL, NULL);
3603
3604 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
3605 }
3606
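/* Before setting tcp_md5sig_pool_populated, we must commit all writes
 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
 */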
3609 smp_wmb();
3610 tcp_md5sig_pool_populated = true;
3611}
3612
3613bool tcp_alloc_md5sig_pool(void)
3614{
3615 if (unlikely(!tcp_md5sig_pool_populated)) {
3616 mutex_lock(&tcp_md5sig_mutex);
3617
3618 if (!tcp_md5sig_pool_populated)
3619 __tcp_alloc_md5sig_pool();
3620
3621 mutex_unlock(&tcp_md5sig_mutex);
3622 }
3623 return tcp_md5sig_pool_populated;
3624}
3625EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3626
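/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	We use percpu structure, so if we succeed, we exit with preemption
 *	and cache-hot data.
 */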
3635struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3636{
3637 local_bh_disable();
3638
3639 if (tcp_md5sig_pool_populated) {
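/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */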
3641 smp_rmb();
3642 return this_cpu_ptr(&tcp_md5sig_pool);
3643 }
3644 local_bh_enable();
3645 return NULL;
3646}
3647EXPORT_SYMBOL(tcp_get_md5sig_pool);
3648
3649int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3650 const struct sk_buff *skb, unsigned int header_len)
3651{
3652 struct scatterlist sg;
3653 const struct tcphdr *tp = tcp_hdr(skb);
3654 struct ahash_request *req = hp->md5_req;
3655 unsigned int i;
3656 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3657 skb_headlen(skb) - header_len : 0;
3658 const struct skb_shared_info *shi = skb_shinfo(skb);
3659 struct sk_buff *frag_iter;
3660
3661 sg_init_table(&sg, 1);
3662
3663 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3664 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
3665 if (crypto_ahash_update(req))
3666 return 1;
3667
3668 for (i = 0; i < shi->nr_frags; ++i) {
3669 const struct skb_frag_struct *f = &shi->frags[i];
3670 unsigned int offset = f->page_offset;
3671 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3672
3673 sg_set_page(&sg, page, skb_frag_size(f),
3674 offset_in_page(offset));
3675 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
3676 if (crypto_ahash_update(req))
3677 return 1;
3678 }
3679
3680 skb_walk_frags(skb, frag_iter)
3681 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3682 return 1;
3683
3684 return 0;
3685}
3686EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3687
3688int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3689{
3690 struct scatterlist sg;
3691
3692 sg_init_one(&sg, key->key, key->keylen);
3693 ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
3694 return crypto_ahash_update(hp->md5_req);
3695}
3696EXPORT_SYMBOL(tcp_md5_hash_key);
3697
3698#endif
3699
3700void tcp_done(struct sock *sk)
3701{
3702 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3703
3704 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3705 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3706
3707 tcp_set_state(sk, TCP_CLOSE);
3708 tcp_clear_xmit_timers(sk);
3709 if (req)
3710 reqsk_fastopen_remove(sk, req, false);
3711
3712 sk->sk_shutdown = SHUTDOWN_MASK;
3713
3714 if (!sock_flag(sk, SOCK_DEAD))
3715 sk->sk_state_change(sk);
3716 else
3717 inet_csk_destroy_sock(sk);
3718}
3719EXPORT_SYMBOL_GPL(tcp_done);
3720
3721int tcp_abort(struct sock *sk, int err)
3722{
3723 if (!sk_fullsock(sk)) {
3724 if (sk->sk_state == TCP_NEW_SYN_RECV) {
3725 struct request_sock *req = inet_reqsk(sk);
3726
3727 local_bh_disable();
3728 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
3729 local_bh_enable();
3730 return 0;
3731 }
3732 return -EOPNOTSUPP;
3733 }
3734
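/* Don't race with userspace socket closes such as tcp_close. */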
3736 lock_sock(sk);
3737
3738 if (sk->sk_state == TCP_LISTEN) {
3739 tcp_set_state(sk, TCP_CLOSE);
3740 inet_csk_listen_stop(sk);
3741 }
3742
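/* Don't race with BH socket closes such as inet_csk_listen_stop. */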
3744 local_bh_disable();
3745 bh_lock_sock(sk);
3746
3747 if (!sock_flag(sk, SOCK_DEAD)) {
3748 sk->sk_err = err;
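/* This barrier is coupled with smp_rmb() in tcp_poll() */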
3750 smp_wmb();
3751 sk->sk_error_report(sk);
3752 if (tcp_need_reset(sk->sk_state))
3753 tcp_send_active_reset(sk, GFP_ATOMIC);
3754 tcp_done(sk);
3755 }
3756
3757 bh_unlock_sock(sk);
3758 local_bh_enable();
3759 tcp_write_queue_purge(sk);
3760 release_sock(sk);
3761 return 0;
3762}
3763EXPORT_SYMBOL_GPL(tcp_abort);
3764
3765extern struct tcp_congestion_ops tcp_reno;
3766
3767static __initdata unsigned long thash_entries;
3768static int __init set_thash_entries(char *str)
3769{
3770 ssize_t ret;
3771
3772 if (!str)
3773 return 0;
3774
3775 ret = kstrtoul(str, 0, &thash_entries);
3776 if (ret)
3777 return 0;
3778
3779 return 1;
3780}
3781__setup("thash_entries=", set_thash_entries);
3782
3783static void __init tcp_init_mem(void)
3784{
3785 unsigned long limit = nr_free_buffer_pages() / 16;
3786
3787 limit = max(limit, 128UL);
3788 sysctl_tcp_mem[0] = limit / 4 * 3;
3789 sysctl_tcp_mem[1] = limit;
3790 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
3791}
3792
3793void __init tcp_init(void)
3794{
3795 int max_rshare, max_wshare, cnt;
3796 unsigned long limit;
3797 unsigned int i;
3798
3799 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3800 FIELD_SIZEOF(struct sk_buff, cb));
3801
3802 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3803 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3804 inet_hashinfo_init(&tcp_hashinfo);
3805 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
3806 thash_entries, 21, /* one slot per 2 MB */
3807 0, 64 * 1024);
3808 tcp_hashinfo.bind_bucket_cachep =
3809 kmem_cache_create("tcp_bind_bucket",
3810 sizeof(struct inet_bind_bucket), 0,
3811 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3812
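/* Size and allocate the main established and bind bucket
 * hash tables.
 *
 * The methodology is similar to that of the buffer cache.
 */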
3818 tcp_hashinfo.ehash =
3819 alloc_large_system_hash("TCP established",
3820 sizeof(struct inet_ehash_bucket),
3821 thash_entries,
3822 17, /* one slot per 128 KB of memory */
3823 0,
3824 NULL,
3825 &tcp_hashinfo.ehash_mask,
3826 0,
3827 thash_entries ? 0 : 512 * 1024);
3828 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
3829 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3830
3831 if (inet_ehash_locks_alloc(&tcp_hashinfo))
3832 panic("TCP: failed to alloc ehash_locks");
3833 tcp_hashinfo.bhash =
3834 alloc_large_system_hash("TCP bind",
3835 sizeof(struct inet_bind_hashbucket),
3836 tcp_hashinfo.ehash_mask + 1,
3837 17, /* one slot per 128 KB of memory */
3838 0,
3839 &tcp_hashinfo.bhash_size,
3840 NULL,
3841 0,
3842 64 * 1024);
3843 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3844 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3845 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3846 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3847 }
3848
3850 cnt = tcp_hashinfo.ehash_mask + 1;
3851 sysctl_tcp_max_orphans = cnt / 2;
3852
3853 tcp_init_mem();
3854
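/* Set per-socket limits to no more than 1/128 the pressure threshold */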
3855 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3856 max_wshare = min(4UL*1024*1024, limit);
3857 max_rshare = min(6UL*1024*1024, limit);
3858
3859 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3860 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
3861 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3862
3863 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3864 init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
3865 init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
3866
3867 pr_info("Hash tables configured (established %u bind %u)\n",
3868 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3869
3870 tcp_v4_init();
3871 tcp_metrics_init();
3872 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3873 tcp_tasklet_init();
3874}
3875