/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the primary means of communication with the user
 *		level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/* Current number of TCP sockets. */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/* TCP splicing state, used by tcp_splice_read(). */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};
/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (tcp_memory_pressure)
		return;
	val = jiffies;

	/* 0 is special and means the end of memory pressure. */
	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!tcp_memory_pressure)
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);

/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

/* Convert retransmits to seconds based on initial and max timeout */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}
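
/*
 * Worked example (illustrative note, not from the original source):
 * with an initial timeout of 1 (e.g. seconds) and a cap of 120, a
 * limit of 9 maps to 4 retransmits, since the cumulative periods
 * after each doubling are 1, 1+2=3, 3+4=7 and 7+8=15, and 9 falls
 * between 7 and 15; retrans_to_secs(4, 1, 120) returns 15 back.
 */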

static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
	u32 rate = READ_ONCE(tp->rate_delivered);
	u32 intv = READ_ONCE(tp->rate_interval_us);
	u64 rate64 = 0;

	if (rate && intv) {
		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
		do_div(rate64, intv);
	}
	return rate64;
}
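
/*
 * In other words (illustrative note, not from the original source):
 *
 *	rate64 = rate_delivered * mss_cache * USEC_PER_SEC / rate_interval_us
 *
 * i.e. the most recent delivery sample scaled to bytes per second.
 * For example, 10 segments of 1448 bytes delivered over a 5000 usec
 * interval give (10 * 1448 * 1000000) / 5000 = 2,896,000 bytes/sec.
 */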

/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];

	sk_sockets_allocated_inc(sk);
	sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);

void tcp_init_transfer(struct sock *sk, int bpf_op)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_mtup_init(sk);
	icsk->icsk_af_ops->rebuild_header(sk);
	tcp_init_metrics(sk);
	tcp_call_bpf(sk, bpf_op, 0, NULL);
	tcp_init_congestion_control(sk);
	tcp_init_buffer_space(sk);
}

static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack = 1;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}
}

static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
					  int target, struct sock *sk)
{
	return (tp->rcv_nxt - tp->copied_seq >= target) ||
		(sk->sk_prot->stream_memory_read ?
		sk->sk_prot->stream_memory_read(sk) : false);
}

/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */
	mask = 0;

	/* EPOLLHUP is unmaskable, so it is only set when shutdown has
	 * happened in both directions (or the socket is fully closed);
	 * otherwise poll() on a half-closed socket could never be used
	 * for the still-open direction. EPOLLRDHUP reports the receive
	 * side shutdown alone.
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	/* Connected or passive Fast Open socket? */
	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);

		if (tp->urg_seq == tp->copied_seq &&
		    !sock_flag(sk, SOCK_URGINLINE) &&
		    tp->urg_data)
			target++;

		if (tcp_stream_is_readable(tp, target, sk))
			mask |= EPOLLIN | EPOLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_is_writeable(sk)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {  /* send SIGIO later */
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost. Memory barrier
				 * pairs with the input side.
				 */
				smp_mb__after_atomic();
				if (sk_stream_is_writeable(sk))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;

		if (tp->urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		/* Active TCP fastopen socket with defer_connect
		 * Return EPOLLOUT so application can call write()
		 * in order for kernel to generate SYN+data
		 */
		mask |= EPOLLOUT | EPOLLWRNORM;
	}

	/* This barrier is coupled with smp_wmb() in tcp_reset() */
	smp_rmb();
	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_nxt;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);
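
/*
 * Illustrative user-space sketch (not kernel code) of the ioctls
 * handled above; "fd" is assumed to be a connected TCP socket and
 * error checking is omitted:
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int unread, unacked, unsent;
 *	ioctl(fd, SIOCINQ, &unread);	 // bytes queued for reading
 *	ioctl(fd, SIOCOUTQ, &unacked);	 // bytes written but not yet acked
 *	ioctl(fd, SIOCOUTQNSD, &unsent); // bytes queued but not yet sent
 */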

static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

static void skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	skb->csum = 0;
	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	tcb->sacked = 0;
	__skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;

	tcp_slow_start_after_idle_check(sk);
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

/* If a not yet filled skb is pushed, do not send it if
 * we have data packets in Qdisc or NIC queues :
 * Because TX completion will happen shortly, it gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade off.
 * As packets containing data payload have a bigger truesize
 * than pure acks (dataless) packets, the last checks prevent
 * autocorking if we only have an ACK in Qdisc/NIC queues,
 * or if TX completion was delayed after we processed ACK packet.
 */
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
	       !tcp_rtx_queue_empty(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}

static void tcp_push(struct sock *sk, int flags, int mss_now,
		     int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		/* avoid atomic op if TSQ_THROTTLED bit is already set */
		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		}
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
			      min(rd_desc->count, len), tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	/* Store TCP splice context information in read_descriptor_t. */
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count	  = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);

	/* We can't seek on a socket input. */
	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}
			/* if __tcp_splice_read() got nothing while we have
			 * an skb in receive queue, we do not want to loop.
			 * This might happen with URG data.
			 */
			if (!skb_queue_empty(&sk->sk_receive_queue))
				break;
			sk_wait_data(sk, &timeo, NULL);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);
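
/*
 * Illustrative user-space sketch (not kernel code): moving received
 * data from a TCP socket into a pipe without a copy through user
 * memory. "sock_fd" is assumed connected; error handling is omitted.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int pfd[2];
 *	pipe(pfd);
 *	ssize_t n = splice(sock_fd, NULL, pfd[1], NULL, 65536,
 *			   SPLICE_F_MOVE);
 *	// data now sits in the pipe and can be spliced onward,
 *	// e.g. into a file descriptor, still without copying
 */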

struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
				    bool force_schedule)
{
	struct sk_buff *skb;

	/* The TCP header must be at least 32-bit aligned. */
	size = ALIGN(size, 4);

	if (unlikely(tcp_under_memory_pressure(sk)))
		sk_mem_reclaim_partial(sk);

	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
	if (likely(skb)) {
		bool mem_scheduled;

		if (force_schedule) {
			mem_scheduled = true;
			sk_forced_mem_schedule(sk, skb->truesize);
		} else {
			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
		}
		if (likely(mem_scheduled)) {
			skb_reserve(skb, sk->sk_prot->max_header);
			/*
			 * Make sure that we have exactly size bytes
			 * available to the caller, no more, no less.
			 */
			skb->reserved_tailroom = skb->end - skb->tail - size;
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 new_size_goal, size_goal;

	if (!large_allowed)
		return mss_now;

	/* Note : tcp_tso_autosize() will eventually split this later */
	new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);

	/* We try hard to avoid divides here */
	size_goal = tp->gso_segs * mss_now;
	if (unlikely(new_size_goal < size_goal ||
		     new_size_goal >= size_goal + mss_now)) {
		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
				     sk->sk_gso_max_segs);
		size_goal = tp->gso_segs * mss_now;
	}

	return max(size_goal, mss_now);
}

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
			 size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto out_err;
	}

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		int copy, i;
		bool can_coalesce;

		if (!skb || (copy = size_goal - skb->len) <= 0 ||
		    !tcp_skb_can_collapse_to(skb)) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
					tcp_rtx_and_write_queues_empty(sk));
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= sysctl_max_skb_frags) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (!sk_wmem_schedule(sk, copy))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}

		if (!(flags & MSG_NO_SHARED_FRAGS))
			skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk_mem_charge(sk, copy);
		skb->ip_summed = CHECKSUM_PARTIAL;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		tcp_skb_pcount_set(skb, 0);

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		size -= copy;
		if (!size)
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		tcp_push(sk, flags & ~MSG_MORE, mss_now,
			 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied) {
		tcp_tx_timestamp(sk, sk->sk_tsflags);
		if (!(flags & MSG_SENDPAGE_NOTLAST))
			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
		     err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return sk_stream_error(sk, flags, err);
}
EXPORT_SYMBOL_GPL(do_tcp_sendpages);

int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
			size_t size, int flags)
{
	if (!(sk->sk_route_caps & NETIF_F_SG))
		return sock_no_sendpage_locked(sk, page, offset, size, flags);

	tcp_rate_check_app_limited(sk);	/* is sending application-limited? */

	return do_tcp_sendpages(sk, page, offset, size, flags);
}
EXPORT_SYMBOL_GPL(tcp_sendpage_locked);

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendpage_locked(sk, page, offset, size, flags);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sendpage);
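
/*
 * do_tcp_sendpages() is the path that ultimately services zero-copy
 * writes such as sendfile(2). A minimal user-space sketch (not kernel
 * code), assuming "file_fd" and "sock_fd" are valid and error checks
 * are omitted:
 *
 *	#include <sys/sendfile.h>
 *
 *	off_t off = 0;
 *	ssize_t sent = sendfile(sock_fd, file_fd, &off, count);
 *
 * The file's page-cache pages are attached to skb fragments directly;
 * this requires NETIF_F_SG, otherwise tcp_sendpage_locked() above
 * falls back to the copying sock_no_sendpage_locked() path.
 */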

/* Do not bother using a page frag for very small frames.
 * But use this heuristic only for the first skb in write queue.
 *
 * Having no payload in skb->head allows better SACK shifting
 * in tcp_shift_skb_data(), reducing sack/rack overhead, because
 * write queue has less skbs.
 * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
 * This also speeds up tso_fragment(), since it wont fallback
 * to tcp_fragment().
 */
static int linear_payload_sz(bool first_skb)
{
	if (first_skb)
		return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
	return 0;
}

static int select_size(bool first_skb, bool zc)
{
	if (zc)
		return 0;
	return linear_payload_sz(first_skb);
}

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
				int *copied, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr *uaddr = msg->msg_name;
	int err, flags;

	if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
	     uaddr->sa_family == AF_UNSPEC))
		return -EOPNOTSUPP;
	if (tp->fastopen_req)
		return -EALREADY; /* Another Fast Open is in progress */

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(!tp->fastopen_req))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;
	tp->fastopen_req->size = size;

	if (inet->defer_connect) {
		err = tcp_connect(sk);
		/* Same failure procedure as in tcp_v4/6_connect */
		if (err) {
			tcp_set_state(sk, TCP_CLOSE);
			inet->inet_dport = 0;
			sk->sk_route_caps = 0;
		}
	}
	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, uaddr,
				    msg->msg_namelen, flags, 1);
	/* fastopen_req could already be freed in __inet_stream_connect
	 * if the connection times out or gets rst
	 */
	if (tp->fastopen_req) {
		*copied = tp->fastopen_req->copied;
		tcp_free_fastopen_req(tp);
		inet->defer_connect = 0;
	}
	return err;
}

int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct ubuf_info *uarg = NULL;
	struct sk_buff *skb;
	struct sockcm_cookie sockc;
	int flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0;
	bool process_backlog = false;
	bool zc = false;
	long timeo;

	flags = msg->msg_flags;

	if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
		if (sk->sk_state != TCP_ESTABLISHED) {
			err = -EINVAL;
			goto out_err;
		}

		skb = tcp_write_queue_tail(sk);
		uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
		if (!uarg) {
			err = -ENOBUFS;
			goto out_err;
		}

		zc = sk->sk_route_caps & NETIF_F_SG;
		if (!zc)
			uarg->zerocopy = 0;
	}

	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
	    !tp->repair) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	sockcm_init(&sockc, sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err)) {
			err = -EINVAL;
			goto out_err;
		}
	}

	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	/* Ok commence sending. */
	copied = 0;

restart:
	mss_now = tcp_send_mss(sk, &size_goal, flags);

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	while (msg_data_left(msg)) {
		int copy = 0;

		skb = tcp_write_queue_tail(sk);
		if (skb)
			copy = size_goal - skb->len;

		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
			bool first_skb;
			int linear;

new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			if (process_backlog && sk_flush_backlog(sk)) {
				process_backlog = false;
				goto restart;
			}
			first_skb = tcp_rtx_and_write_queues_empty(sk);
			linear = select_size(first_skb, zc);
			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
						  first_skb);
			if (!skb)
				goto wait_for_memory;

			process_backlog = true;
			skb->ip_summed = CHECKSUM_PARTIAL;

			skb_entail(sk, skb);
			copy = size_goal;

			/* All packets are restored as if they have
			 * already been sent. skb_mstamp isn't set to
			 * avoid wrong rtt estimation.
			 */
			if (tp->repair)
				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
		}

		/* Try to append data to the end of skb. */
		if (copy > msg_data_left(msg))
			copy = msg_data_left(msg);

		/* Where to copy to? */
		if (skb_availroom(skb) > 0 && !zc) {
			/* We have some space in skb head. Superb! */
			copy = min_t(int, copy, skb_availroom(skb));
			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
			if (err)
				goto do_fault;
		} else if (!zc) {
			bool merge = true;
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			if (!sk_page_frag_refill(sk, pfrag))
				goto wait_for_memory;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				if (i >= sysctl_max_skb_frags) {
					tcp_mark_push(tp, skb);
					goto new_segment;
				}
				merge = false;
			}

			copy = min_t(int, copy, pfrag->size - pfrag->offset);

			if (!sk_wmem_schedule(sk, copy))
				goto wait_for_memory;

			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
						       pfrag->page,
						       pfrag->offset,
						       copy);
			if (err)
				goto do_error;

			/* Update the skb. */
			if (merge) {
				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			} else {
				skb_fill_page_desc(skb, i, pfrag->page,
						   pfrag->offset, copy);
				page_ref_inc(pfrag->page);
			}
			pfrag->offset += copy;
		} else {
			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
			if (err == -EMSGSIZE || err == -EEXIST) {
				tcp_mark_push(tp, skb);
				goto new_segment;
			}
			if (err < 0)
				goto do_error;
			copy = err;
		}

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		tcp_skb_pcount_set(skb, 0);

		copied += copy;
		if (!msg_data_left(msg)) {
			if (unlikely(flags & MSG_EOR))
				TCP_SKB_CB(skb)->eor = 1;
			goto out;
		}

		if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		if (copied)
			tcp_push(sk, flags & ~MSG_MORE, mss_now,
				 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied) {
		tcp_tx_timestamp(sk, sockc.tsflags);
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
out_nopush:
	sock_zerocopy_put(uarg);
	return copied + copied_syn;

do_fault:
	if (!skb->len) {
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_wmem_free_skb(sk, skb);
	}

do_error:
	if (copied + copied_syn)
		goto out;
out_err:
	sock_zerocopy_put_abort(uarg);
	err = sk_stream_error(sk, flags, err);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
		     err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);

int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendmsg_locked(sk, msg, size);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sendmsg);
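
/*
 * Illustrative user-space sketch (not kernel code) of the MSG_ZEROCOPY
 * path handled by tcp_sendmsg_locked() above. Completion notifications
 * arrive on the socket error queue once the pages may be reused.
 * Error handling is omitted; "fd" is a connected TCP socket.
 *
 *	#include <sys/socket.h>
 *	#include <linux/errqueue.h>
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));
 *	send(fd, buf, len, MSG_ZEROCOPY);
 *	// ... later, reap the completion before reusing buf:
 *	recvmsg(fd, &msg, MSG_ERRQUEUE);
 */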

/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = TCP_URG_READ;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_to_msg(msg, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and the
	 * available implementations agree in this case: this call should
	 * never block, independent of the blocking state of the socket.
	 */
	return -EAGAIN;
}

static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
	struct sk_buff *skb;
	int copied = 0, err = 0;

	/* XXX -- need to support SO_PEEK_OFF */

	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			return err;
		copied += skb->len;
	}

	skb_queue_walk(&sk->sk_write_queue, skb) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			break;

		copied += skb->len;
	}

	return err ?: copied;
}

/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
static void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);

		/* Delayed ACKs frequently hit locked sockets during
		 * bulk receive.
		 */
		if (icsk->icsk_ack.blocked ||
		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !icsk->icsk_ack.pingpong)) &&
		      !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	/* We send an ACK if we can now advertise a non-zero window
	 * which has been raised "significantly".
	 *
	 * Even if window raised up to infinity, do not send window open ACK
	 * in states, where we will not receive more. It is useless.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than
			 * current one. "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			pr_err_once("%s: found a SYN, please report !\n", __func__);
			offset--;
		}
		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
			*off = offset;
			return skb;
		}
		/* This looks weird, but this can happen if TCP collapsing
		 * splitted a fat GRO packet, while we released socket lock
		 * in skb_splice_bits()
		 */
		sk_eat_skb(sk, skb);
	}
	return NULL;
}

/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data. */
			if (tp->urg_data) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= 0) {
				if (!copied)
					copied = used;
				break;
			} else if (used <= len) {
				seq += used;
				copied += used;
				offset += used;
			}
			/* If recv_actor drops the lock (e.g. TCP splice
			 * receive) the skb pointer might be invalid when
			 * getting here: tcp_collapse might have deleted it
			 * while aggregating skbs from the socket queue.
			 */
			skb = tcp_recv_skb(sk, seq - 1, &offset);
			if (!skb)
				break;
			/* TCP coalescing might have appended data to the skb.
			 * Try to splice more frags
			 */
			if (offset + 1 != skb->len)
				continue;
		}
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
			sk_eat_skb(sk, skb);
			++seq;
			break;
		}
		sk_eat_skb(sk, skb);
		if (!desc->count)
			break;
		tp->copied_seq = seq;
	}
	tp->copied_seq = seq;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied > 0) {
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, copied);
	}
	return copied;
}
EXPORT_SYMBOL(tcp_read_sock);

int tcp_peek_len(struct socket *sock)
{
	return tcp_inq(sock->sk);
}
EXPORT_SYMBOL(tcp_peek_len);

/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
int tcp_set_rcvlowat(struct sock *sk, int val)
{
	int cap;

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		cap = sk->sk_rcvbuf >> 1;
	else
		cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
	val = min(val, cap);
	sk->sk_rcvlowat = val ? : 1;

	/* Check if we need to signal EPOLLIN right now */
	tcp_data_ready(sk);

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		return 0;

	val <<= 1;
	if (val > sk->sk_rcvbuf) {
		sk->sk_rcvbuf = val;
		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
	}
	return 0;
}
EXPORT_SYMBOL(tcp_set_rcvlowat);
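
/*
 * Illustrative user-space sketch (not kernel code): raising the
 * receive low-water mark so poll()/read() do not wake up before at
 * least 64KB is queued:
 *
 *	int lowat = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
 *
 * As implemented above, TCP also grows sk_rcvbuf to twice the hint
 * (unless SO_RCVBUF was locked explicitly) so the advertised window
 * can actually accommodate that much queued data.
 */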

#ifdef CONFIG_MMU
static const struct vm_operations_struct tcp_vm_ops = {
};

int tcp_mmap(struct file *file, struct socket *sock,
	     struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
		return -EPERM;
	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);

	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
	vma->vm_flags |= VM_MIXEDMAP;

	vma->vm_ops = &tcp_vm_ops;
	return 0;
}
EXPORT_SYMBOL(tcp_mmap);

static int tcp_zerocopy_receive(struct sock *sk,
				struct tcp_zerocopy_receive *zc)
{
	unsigned long address = (unsigned long)zc->address;
	const skb_frag_t *frags = NULL;
	u32 length = 0, seq, offset;
	struct vm_area_struct *vma;
	struct sk_buff *skb = NULL;
	struct tcp_sock *tp;
	int inq;
	int ret;

	if (address & (PAGE_SIZE - 1) || address != zc->address)
		return -EINVAL;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;

	sock_rps_record_flow(sk);

	down_read(&current->mm->mmap_sem);

	ret = -EINVAL;
	vma = find_vma(current->mm, address);
	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
		goto out;
	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);

	tp = tcp_sk(sk);
	seq = tp->copied_seq;
	inq = tcp_inq(sk);
	zc->length = min_t(u32, zc->length, inq);
	zc->length &= ~(PAGE_SIZE - 1);
	if (zc->length) {
		zap_page_range(vma, address, zc->length);
		zc->recv_skip_hint = 0;
	} else {
		zc->recv_skip_hint = inq;
	}
	ret = 0;
	while (length + PAGE_SIZE <= zc->length) {
		if (zc->recv_skip_hint < PAGE_SIZE) {
			if (skb) {
				skb = skb->next;
				offset = seq - TCP_SKB_CB(skb)->seq;
			} else {
				skb = tcp_recv_skb(sk, seq, &offset);
			}

			zc->recv_skip_hint = skb->len - offset;
			offset -= skb_headlen(skb);
			if ((int)offset < 0 || skb_has_frag_list(skb))
				break;
			frags = skb_shinfo(skb)->frags;
			while (offset) {
				if (frags->size > offset)
					goto out;
				offset -= frags->size;
				frags++;
			}
		}
		if (frags->size != PAGE_SIZE || frags->page_offset) {
			int remaining = zc->recv_skip_hint;

			while (remaining && (frags->size != PAGE_SIZE ||
					     frags->page_offset)) {
				remaining -= frags->size;
				frags++;
			}
			zc->recv_skip_hint -= remaining;
			break;
		}
		ret = vm_insert_page(vma, address + length,
				     skb_frag_page(frags));
		if (ret)
			break;
		length += PAGE_SIZE;
		seq += PAGE_SIZE;
		zc->recv_skip_hint -= PAGE_SIZE;
		frags++;
	}
out:
	up_read(&current->mm->mmap_sem);
	if (length) {
		tp->copied_seq = seq;
		tcp_rcv_space_adjust(sk);

		/* Clean up data we have read: This will do ACK frames. */
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, length);
		ret = 0;
		if (length == zc->length)
			zc->recv_skip_hint = 0;
	} else {
		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
			ret = -EIO;
	}
	zc->length = length;
	return ret;
}
#endif
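
/*
 * Illustrative user-space sketch (not kernel code; the calling
 * convention is an assumption based on the interface above): map a
 * read-only region with the socket, then ask getsockopt() to install
 * the payload pages into it. Error handling is omitted.
 *
 *	#include <sys/mman.h>
 *	#include <netinet/tcp.h>
 *
 *	void *addr = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);
 *	struct tcp_zerocopy_receive zc = {
 *		.address = (__u64)(unsigned long)addr,
 *		.length  = chunk,
 *	};
 *	socklen_t zc_len = sizeof(zc);
 *	getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
 *	// zc.length bytes are now mapped at addr; zc.recv_skip_hint
 *	// bytes (if any) must still be read with recv() as usual.
 */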

static void tcp_update_recv_tstamps(struct sk_buff *skb,
				    struct scm_timestamping *tss)
{
	if (skb->tstamp)
		tss->ts[0] = ktime_to_timespec(skb->tstamp);
	else
		tss->ts[0] = (struct timespec) {0};

	if (skb_hwtstamps(skb)->hwtstamp)
		tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
	else
		tss->ts[2] = (struct timespec) {0};
}

/* Similar to __sock_recv_timestamp, but does not require an skb */
static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
			       struct scm_timestamping *tss)
{
	struct timeval tv;
	bool has_timestamping = false;

	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
				put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
					 sizeof(tss->ts[0]), &tss->ts[0]);
			} else {
				tv.tv_sec = tss->ts[0].tv_sec;
				tv.tv_usec = tss->ts[0].tv_nsec / 1000;

				put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
					 sizeof(tv), &tv);
			}
		}

		if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
			has_timestamping = true;
		else
			tss->ts[0] = (struct timespec) {0};
	}

	if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
		if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
			has_timestamping = true;
		else
			tss->ts[2] = (struct timespec) {0};
	}

	if (has_timestamping) {
		tss->ts[1] = (struct timespec) {0};
		put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
			 sizeof(*tss), tss);
	}
}

static int tcp_inq_hint(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 copied_seq = READ_ONCE(tp->copied_seq);
	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
	int inq;

	inq = rcv_nxt - copied_seq;
	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
		lock_sock(sk);
		inq = tp->rcv_nxt - tp->copied_seq;
		release_sock(sk);
	}
	return inq;
}

/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical Notes: the socket must be locked by the caller. The
 *	lock is dropped while waiting for data and while processing the
 *	backlog, which is why copied_seq is re-checked under MSG_PEEK.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err, inq;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct sk_buff *skb, *last;
	u32 urg_hole = 0;
	struct scm_timestamping tss;
	bool has_tss = false;
	bool has_cmsg;

	if (unlikely(flags & MSG_ERRQUEUE))
		return inet_recv_error(sk, msg, len, addr_len);

	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
	    (sk->sk_state == TCP_ESTABLISHED))
		sk_busy_loop(sk, nonblock);

	lock_sock(sk);

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	has_cmsg = tp->recvmsg_inq;
	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	if (unlikely(tp->repair)) {
		err = -EPERM;
		if (!(flags & MSG_PEEK))
			goto out;

		if (tp->repair_queue == TCP_SEND_QUEUE)
			goto recv_sndq;

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out;

		/* 'common' recv queue MSG_PEEK-ing */
	}

	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	do {
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything
		 * or have a signal pending.
		 */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		last = skb_peek_tail(&sk->sk_receive_queue);
		skb_queue_walk(&sk->sk_receive_queue, skb) {
			last = skb;
			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				pr_err_once("%s: found a SYN, please report !\n", __func__);
				offset--;
			}
			if (offset < skb->len)
				goto found_ok_skb;
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		/* Well, if we have backlog, try to process it now yet. */

		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				/* This occurs when user tries to read
				 * from never connected socket.
				 */
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		tcp_cleanup_rbuf(sk, copied);

		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else {
			sk_wait_data(sk, &timeo, last);
		}

		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
					    current->comm,
					    task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

found_ok_skb:
		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						++*seq;
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
			err = skb_copy_datagram_msg(skb, offset, msg, used);
			if (err) {
				/* Exception. Bailout! */
				if (!copied)
					copied = -EFAULT;
				break;
			}
		}

		*seq += used;
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			tp->urg_data = 0;
			tcp_fast_path_check(sk);
		}
		if (used + offset < skb->len)
			continue;

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			tcp_update_recv_tstamps(skb, &tss);
			has_tss = true;
			has_cmsg = true;
		}
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		continue;

found_fin_ok:
		/* Process the FIN. */
		++*seq;
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		break;
	} while (len > 0);

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket.
	 */

	/* Clean up data we have read: This will do ACK frames. */
	tcp_cleanup_rbuf(sk, copied);

	release_sock(sk);

	if (has_cmsg) {
		if (has_tss)
			tcp_recv_timestamp(msg, sk, &tss);
		if (tp->recvmsg_inq) {
			inq = tcp_inq_hint(sk);
			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
		}
	}

	return copied;

out:
	release_sock(sk);
	return err;

recv_urg:
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;

recv_sndq:
	err = tcp_peek_sndq(sk, msg, len);
	goto out;
}
EXPORT_SYMBOL(tcp_recvmsg);
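
/*
 * Illustrative user-space sketch (not kernel code): consuming the inq
 * hint produced by tcp_inq_hint() as a control message, once TCP_INQ
 * has been enabled on the socket. Error handling is omitted.
 *
 *	int one = 1, inq = 0;
 *	setsockopt(fd, IPPROTO_TCP, TCP_INQ, &one, sizeof(one));
 *	...
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	recvmsg(fd, &msg, 0);
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *	if (cm && cm->cmsg_level == SOL_TCP && cm->cmsg_type == TCP_CM_INQ)
 *		inq = *(int *)CMSG_DATA(cm);	// bytes still queued
 */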

void tcp_set_state(struct sock *sk, int state)
{
	int oldstate = sk->sk_state;

	/* We defined a new enum for TCP states that are exported in BPF
	 * so as not force the internal TCP states to be frozen. The
	 * following checks will detect if an internal state value ever
	 * differs from the BPF value. If this ever happens, then we will
	 * need to remap the internal value to the BPF value before calling
	 * tcp_call_bpf_2arg.
	 */
	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);

	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);

	switch (state) {
	case TCP_ESTABLISHED:
		if (oldstate != TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
		break;

	case TCP_CLOSE:
		if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
			TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

		sk->sk_prot->unhash(sk);
		if (inet_csk(sk)->icsk_bind_hash &&
		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
			inet_put_port(sk);
		/* fall through */
	default:
		if (oldstate == TCP_ESTABLISHED)
			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
	}

	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
	inet_sk_state_store(sk, state);

#ifdef STATE_TRACE
	SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);

/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed, we send our FIN out.
 */
static const unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  [0 /* (Invalid) */]	= TCP_CLOSE,
  [TCP_ESTABLISHED]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_SYN_SENT]	= TCP_CLOSE,
  [TCP_SYN_RECV]	= TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_FIN_WAIT1]	= TCP_FIN_WAIT1,
  [TCP_FIN_WAIT2]	= TCP_FIN_WAIT2,
  [TCP_TIME_WAIT]	= TCP_CLOSE,
  [TCP_CLOSE]		= TCP_CLOSE,
  [TCP_CLOSE_WAIT]	= TCP_LAST_ACK  | TCP_ACTION_FIN,
  [TCP_LAST_ACK]	= TCP_LAST_ACK,
  [TCP_LISTEN]		= TCP_CLOSE,
  [TCP_CLOSING]		= TCP_CLOSING,
  [TCP_NEW_SYN_RECV]	= TCP_CLOSE,
};

static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}
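
/*
 * Example (illustrative note): closing an ESTABLISHED socket looks up
 * TCP_FIN_WAIT1 | TCP_ACTION_FIN above, so tcp_close_state() moves the
 * socket to FIN_WAIT1 and returns nonzero, telling the caller to emit
 * a FIN; a socket still in SYN_SENT maps straight to TCP_CLOSE and no
 * FIN is sent.
 */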

/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
EXPORT_SYMBOL(tcp_shutdown);

bool tcp_check_oom(struct sock *sk, int shift)
{
	bool too_many_orphans, out_of_socket_memory;

	too_many_orphans = tcp_too_many_orphans(sk, shift);
	out_of_socket_memory = tcp_out_of_memory(sk);

	if (too_many_orphans)
		net_info_ratelimited("too many orphaned sockets\n");
	if (out_of_socket_memory)
		net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
	return too_many_orphans || out_of_socket_memory;
}

void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;
	int state;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;

		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			len--;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_mem_reclaim(sk);

	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
	if (sk->sk_state == TCP_CLOSE)
		goto adjudge_to_death;

	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost: the receiver must learn that the data it never
	 * read was discarded, rather than see an orderly FIN.
	 * Note: timeout is always zero in such a case.
	 */
	if (unlikely(tcp_sk(sk)->repair)) {
		sk->sk_prot->disconnect(sk, 0);
	} else if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, sk->sk_allocation);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection. The state transitions
		 * (ESTABLISHED -> FIN_WAIT1, CLOSE_WAIT -> LAST_ACK)
		 * are formally legal only once the FIN is actually in
		 * the send window rather than merely queued, so Linux
		 * may briefly look like it is in a later state than a
		 * strict reading of the RFC state machine allows; the
		 * visible effects are harmless.
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	state = sk->sk_state;
	sock_hold(sk);
	sock_orphan(sk);

	local_bh_disable();
	bh_lock_sock(sk);
	/* remove backlog if any, without releasing ownership. */
	__release_sock(sk);

	percpu_counter_inc(sk->sk_prot->orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
		goto out;

	/*	This is a (useful) BSD violating of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 1 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	It is really desired behaviour f.e. on http servers, when
	 *	such sockets are useless, but consume significant resources.
	 *	Let's do it with special linger2 option.	--ANK
	 */
	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk,
						tmo - TCP_TIMEWAIT_LEN);
			} else {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_mem_reclaim(sk);
		if (tcp_check_oom(sk, 0)) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			__NET_INC_STATS(sock_net(sk),
					LINUX_MIB_TCPABORTONMEMORY);
		} else if (!check_net(sock_net(sk))) {
			/* Not possible to send reset; just close */
			tcp_set_state(sk, TCP_CLOSE);
		}
	}

	if (sk->sk_state == TCP_CLOSE) {
		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
		/* We could get here with a non-NULL req if the socket is
		 * aborted (e.g., closed with unread data) before 3WHS
		 * finishes.
		 */
		if (req)
			reqsk_fastopen_remove(sk, req, false);
		inet_csk_destroy_sock(sk);
	}
	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	release_sock(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);

/* These states need RST on ABORT according to RFC793 */

static inline bool tcp_need_reset(int state)
{
	return (1 << state) &
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}

static void tcp_rtx_queue_purge(struct sock *sk)
{
	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);

	while (p) {
		struct sk_buff *skb = rb_to_skb(p);

		p = rb_next(p);
		/* Since we are deleting whole queue, no need to
		 * list_del(&skb->tcp_tsorted_anchor)
		 */
		tcp_rtx_queue_unlink(skb, sk);
		sk_wmem_free_skb(sk, skb);
	}
}

void tcp_write_queue_purge(struct sock *sk)
{
	struct sk_buff *skb;

	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		tcp_skb_tsorted_anchor_cleanup(skb);
		sk_wmem_free_skb(sk, skb);
	}
	tcp_rtx_queue_purge(sk);
	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
	sk_mem_reclaim(sk);
	tcp_clear_all_retrans_hints(tcp_sk(sk));
	tcp_sk(sk)->packets_out = 0;
}

int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int old_state = sk->sk_state;

	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (unlikely(tp->repair)) {
		sk->sk_err = ECONNABORTED;
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepancy of Linux wrt. RFC
		 * states
		 */
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->sk_err = ECONNRESET;

	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	tp->copied_seq = tp->rcv_nxt;
	tp->urg_data = 0;
	tcp_write_queue_purge(sk);
	tcp_fastopen_active_disable_ofo_check(sk);
	skb_rbtree_purge(&tp->out_of_order_queue);

	inet->inet_dport = 0;

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt_us = 0;
	tp->rcv_rtt_last_tsecr = 0;
	tp->write_seq += tp->max_window + 2;
	if (tp->write_seq == 0)
		tp->write_seq = 1;
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_cnt = 0;
	tp->window_clamp = 0;
	tp->delivered_ce = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tp->is_sack_reneg = 0;
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
	 * issue in __tcp_select_window()
	 */
	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
	__sk_dst_reset(sk);
	dst_release(sk->sk_rx_dst);
	sk->sk_rx_dst = NULL;
	tcp_saved_syn_free(tp);
	tp->compressed_ack = 0;
	tp->bytes_sent = 0;
	tp->bytes_retrans = 0;
	tp->duplicate_sack[0].start_seq = 0;
	tp->duplicate_sack[0].end_seq = 0;
	tp->dsack_dups = 0;
	tp->reord_seen = 0;

	/* Clean up fastopen related fields */
	tcp_free_fastopen_req(tp);
	inet->defer_connect = 0;

	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
		sk->sk_frag.offset = 0;
	}

	sk->sk_error_report(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_disconnect);

static inline bool tcp_can_repair_sock(const struct sock *sk)
{
	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
		(sk->sk_state != TCP_LISTEN);
}

static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
{
	struct tcp_repair_window opt;

	if (!tp->repair)
		return -EPERM;

	if (len != sizeof(opt))
		return -EINVAL;

	if (copy_from_user(&opt, optbuf, sizeof(opt)))
		return -EFAULT;

	if (opt.max_window < opt.snd_wnd)
		return -EINVAL;

	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
		return -EINVAL;

	if (after(opt.rcv_wup, tp->rcv_nxt))
		return -EINVAL;

	tp->snd_wl1 = opt.snd_wl1;
	tp->snd_wnd = opt.snd_wnd;
	tp->max_window = opt.max_window;

	tp->rcv_wnd = opt.rcv_wnd;
	tp->rcv_wup = opt.rcv_wup;

	return 0;
}

static int tcp_repair_options_est(struct sock *sk,
		struct tcp_repair_opt __user *optbuf, unsigned int len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_repair_opt opt;

	while (len >= sizeof(opt)) {
		if (copy_from_user(&opt, optbuf, sizeof(opt)))
			return -EFAULT;

		optbuf++;
		len -= sizeof(opt);

		switch (opt.opt_code) {
		case TCPOPT_MSS:
			tp->rx_opt.mss_clamp = opt.opt_val;
			tcp_mtup_init(sk);
			break;
		case TCPOPT_WINDOW:
			{
				u16 snd_wscale = opt.opt_val & 0xFFFF;
				u16 rcv_wscale = opt.opt_val >> 16;

				if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
					return -EFBIG;

				tp->rx_opt.snd_wscale = snd_wscale;
				tp->rx_opt.rcv_wscale = rcv_wscale;
				tp->rx_opt.wscale_ok = 1;
			}
			break;
		case TCPOPT_SACK_PERM:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
			break;
		case TCPOPT_TIMESTAMP:
			if (opt.opt_val != 0)
				return -EINVAL;

			tp->rx_opt.tstamp_ok = 1;
			break;
		}
	}

	return 0;
}
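
/*
 * Illustrative user-space sketch (not kernel code) of the repair mode
 * serviced above, as used by checkpoint/restore tools such as CRIU.
 * Requires CAP_NET_ADMIN; error handling and the sequence/window
 * values to restore are omitted.
 *
 *	int on = TCP_REPAIR_ON, q = TCP_SEND_QUEUE;
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on));
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
 *	setsockopt(fd, IPPROTO_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq));
 *	// ... restore window and options via TCP_REPAIR_WINDOW and
 *	// TCP_REPAIR_OPTIONS, then leave repair mode:
 *	int off = TCP_REPAIR_OFF;
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &off, sizeof(off));
 */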
2714
2715
2716
2717
2718static int do_tcp_setsockopt(struct sock *sk, int level,
2719 int optname, char __user *optval, unsigned int optlen)
2720{
2721 struct tcp_sock *tp = tcp_sk(sk);
2722 struct inet_connection_sock *icsk = inet_csk(sk);
2723 struct net *net = sock_net(sk);
2724 int val;
2725 int err = 0;
2726
2727
2728 switch (optname) {
2729 case TCP_CONGESTION: {
2730 char name[TCP_CA_NAME_MAX];
2731
2732 if (optlen < 1)
2733 return -EINVAL;
2734
2735 val = strncpy_from_user(name, optval,
2736 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2737 if (val < 0)
2738 return -EFAULT;
2739 name[val] = 0;
2740
2741 lock_sock(sk);
2742 err = tcp_set_congestion_control(sk, name, true, true);
2743 release_sock(sk);
2744 return err;
2745 }
2746 case TCP_ULP: {
2747 char name[TCP_ULP_NAME_MAX];
2748
2749 if (optlen < 1)
2750 return -EINVAL;
2751
2752 val = strncpy_from_user(name, optval,
2753 min_t(long, TCP_ULP_NAME_MAX - 1,
2754 optlen));
2755 if (val < 0)
2756 return -EFAULT;
2757 name[val] = 0;
2758
2759 lock_sock(sk);
2760 err = tcp_set_ulp(sk, name);
2761 release_sock(sk);
2762 return err;
2763 }
2764 case TCP_FASTOPEN_KEY: {
2765 __u8 key[TCP_FASTOPEN_KEY_LENGTH];
2766
2767 if (optlen != sizeof(key))
2768 return -EINVAL;
2769
2770 if (copy_from_user(key, optval, optlen))
2771 return -EFAULT;
2772
2773 return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
2774 }
2775 default:
		/* fallthru */
2777 break;
2778 }
2779
2780 if (optlen < sizeof(int))
2781 return -EINVAL;
2782
2783 if (get_user(val, (int __user *)optval))
2784 return -EFAULT;
2785
2786 lock_sock(sk);
2787
2788 switch (optname) {
2789 case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used
		 */
2794 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
2795 err = -EINVAL;
2796 break;
2797 }
2798 tp->rx_opt.user_mss = val;
2799 break;
2800
2801 case TCP_NODELAY:
2802 if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
2811 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2812 tcp_push_pending_frames(sk);
2813 } else {
2814 tp->nonagle &= ~TCP_NAGLE_OFF;
2815 }
2816 break;
2817
2818 case TCP_THIN_LINEAR_TIMEOUTS:
2819 if (val < 0 || val > 1)
2820 err = -EINVAL;
2821 else
2822 tp->thin_lto = val;
2823 break;
2824
2825 case TCP_THIN_DUPACK:
2826 if (val < 0 || val > 1)
2827 err = -EINVAL;
2828 break;
2829
2830 case TCP_REPAIR:
2831 if (!tcp_can_repair_sock(sk))
2832 err = -EPERM;
2833 else if (val == TCP_REPAIR_ON) {
2834 tp->repair = 1;
2835 sk->sk_reuse = SK_FORCE_REUSE;
2836 tp->repair_queue = TCP_NO_QUEUE;
2837 } else if (val == TCP_REPAIR_OFF) {
2838 tp->repair = 0;
2839 sk->sk_reuse = SK_NO_REUSE;
2840 tcp_send_window_probe(sk);
2841 } else if (val == TCP_REPAIR_OFF_NO_WP) {
2842 tp->repair = 0;
2843 sk->sk_reuse = SK_NO_REUSE;
2844 } else
2845 err = -EINVAL;
2846
2847 break;
2848
2849 case TCP_REPAIR_QUEUE:
2850 if (!tp->repair)
2851 err = -EPERM;
2852 else if ((unsigned int)val < TCP_QUEUES_NR)
2853 tp->repair_queue = val;
2854 else
2855 err = -EINVAL;
2856 break;
2857
2858 case TCP_QUEUE_SEQ:
2859 if (sk->sk_state != TCP_CLOSE)
2860 err = -EPERM;
2861 else if (tp->repair_queue == TCP_SEND_QUEUE)
2862 tp->write_seq = val;
2863 else if (tp->repair_queue == TCP_RECV_QUEUE)
2864 tp->rcv_nxt = val;
2865 else
2866 err = -EINVAL;
2867 break;
2868
2869 case TCP_REPAIR_OPTIONS:
2870 if (!tp->repair)
2871 err = -EINVAL;
2872 else if (sk->sk_state == TCP_ESTABLISHED)
2873 err = tcp_repair_options_est(sk,
2874 (struct tcp_repair_opt __user *)optval,
2875 optlen);
2876 else
2877 err = -EPERM;
2878 break;
2879
2880 case TCP_CORK:
		/* When set indicates to always queue non-full frames.
		 * Later the user clears this option and we transmit
		 * any pending partial frames in the queue.  This is
		 * meant to be used alongside sendfile() to get properly
		 * filled frames when the user (for example) must write
		 * out headers with a write() call first and then use
		 * sendfile to send out the data parts.
		 *
		 * TCP_CORK can be set together with TCP_NODELAY and it is
		 * stronger than TCP_NODELAY.
		 */
2892 if (val) {
2893 tp->nonagle |= TCP_NAGLE_CORK;
2894 } else {
2895 tp->nonagle &= ~TCP_NAGLE_CORK;
2896 if (tp->nonagle&TCP_NAGLE_OFF)
2897 tp->nonagle |= TCP_NAGLE_PUSH;
2898 tcp_push_pending_frames(sk);
2899 }
2900 break;
2901
2902 case TCP_KEEPIDLE:
2903 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2904 err = -EINVAL;
2905 else {
2906 tp->keepalive_time = val * HZ;
2907 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2908 !((1 << sk->sk_state) &
2909 (TCPF_CLOSE | TCPF_LISTEN))) {
2910 u32 elapsed = keepalive_time_elapsed(tp);
2911 if (tp->keepalive_time > elapsed)
2912 elapsed = tp->keepalive_time - elapsed;
2913 else
2914 elapsed = 0;
2915 inet_csk_reset_keepalive_timer(sk, elapsed);
2916 }
2917 }
2918 break;
2919 case TCP_KEEPINTVL:
2920 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2921 err = -EINVAL;
2922 else
2923 tp->keepalive_intvl = val * HZ;
2924 break;
2925 case TCP_KEEPCNT:
2926 if (val < 1 || val > MAX_TCP_KEEPCNT)
2927 err = -EINVAL;
2928 else
2929 tp->keepalive_probes = val;
2930 break;
2931 case TCP_SYNCNT:
2932 if (val < 1 || val > MAX_TCP_SYNCNT)
2933 err = -EINVAL;
2934 else
2935 icsk->icsk_syn_retries = val;
2936 break;
2937
2938 case TCP_SAVE_SYN:
2939 if (val < 0 || val > 1)
2940 err = -EINVAL;
2941 else
2942 tp->save_syn = val;
2943 break;
2944
2945 case TCP_LINGER2:
2946 if (val < 0)
2947 tp->linger2 = -1;
2948 else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
2949 tp->linger2 = 0;
2950 else
2951 tp->linger2 = val * HZ;
2952 break;
2953
2954 case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
2956 icsk->icsk_accept_queue.rskq_defer_accept =
2957 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2958 TCP_RTO_MAX / HZ);
2959 break;
2960
2961 case TCP_WINDOW_CLAMP:
2962 if (!val) {
2963 if (sk->sk_state != TCP_CLOSE) {
2964 err = -EINVAL;
2965 break;
2966 }
2967 tp->window_clamp = 0;
2968 } else
2969 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2970 SOCK_MIN_RCVBUF / 2 : val;
2971 break;
2972
2973 case TCP_QUICKACK:
2974 if (!val) {
2975 icsk->icsk_ack.pingpong = 1;
2976 } else {
2977 icsk->icsk_ack.pingpong = 0;
2978 if ((1 << sk->sk_state) &
2979 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2980 inet_csk_ack_scheduled(sk)) {
2981 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2982 tcp_cleanup_rbuf(sk, 1);
2983 if (!(val & 1))
2984 icsk->icsk_ack.pingpong = 1;
2985 }
2986 }
2987 break;
2988
2989#ifdef CONFIG_TCP_MD5SIG
2990 case TCP_MD5SIG:
2991 case TCP_MD5SIG_EXT:
2992 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
2993 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
2994 else
2995 err = -EINVAL;
2996 break;
2997#endif
2998 case TCP_USER_TIMEOUT:
		/* Cap the max time in ms TCP will retry or probe the window
		 * before giving up and aborting (ETIMEDOUT) a connection.
		 */
3002 if (val < 0)
3003 err = -EINVAL;
3004 else
3005 icsk->icsk_user_timeout = val;
3006 break;
3007
3008 case TCP_FASTOPEN:
3009 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3010 TCPF_LISTEN))) {
3011 tcp_fastopen_init_key_once(net);
3012
3013 fastopen_queue_tune(sk, val);
3014 } else {
3015 err = -EINVAL;
3016 }
3017 break;
3018 case TCP_FASTOPEN_CONNECT:
3019 if (val > 1 || val < 0) {
3020 err = -EINVAL;
3021 } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3022 if (sk->sk_state == TCP_CLOSE)
3023 tp->fastopen_connect = val;
3024 else
3025 err = -EINVAL;
3026 } else {
3027 err = -EOPNOTSUPP;
3028 }
3029 break;
3030 case TCP_FASTOPEN_NO_COOKIE:
3031 if (val > 1 || val < 0)
3032 err = -EINVAL;
3033 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3034 err = -EINVAL;
3035 else
3036 tp->fastopen_no_cookie = val;
3037 break;
3038 case TCP_TIMESTAMP:
3039 if (!tp->repair)
3040 err = -EPERM;
3041 else
3042 tp->tsoffset = val - tcp_time_stamp_raw();
3043 break;
3044 case TCP_REPAIR_WINDOW:
3045 err = tcp_repair_set_window(tp, optval, optlen);
3046 break;
3047 case TCP_NOTSENT_LOWAT:
3048 tp->notsent_lowat = val;
3049 sk->sk_write_space(sk);
3050 break;
3051 case TCP_INQ:
3052 if (val > 1 || val < 0)
3053 err = -EINVAL;
3054 else
3055 tp->recvmsg_inq = val;
3056 break;
3057 default:
3058 err = -ENOPROTOOPT;
3059 break;
3060 }
3061
3062 release_sock(sk);
3063 return err;
3064}
3065
3066int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
3067 unsigned int optlen)
3068{
3069 const struct inet_connection_sock *icsk = inet_csk(sk);
3070
3071 if (level != SOL_TCP)
3072 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3073 optval, optlen);
3074 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3075}
3076EXPORT_SYMBOL(tcp_setsockopt);
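
/* Example (added; userspace, error handling omitted) of the string-valued
 * options handled above before the integer fast path:
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cubic", 5);
 */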
3077
3078#ifdef CONFIG_COMPAT
3079int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
3080 char __user *optval, unsigned int optlen)
3081{
3082 if (level != SOL_TCP)
3083 return inet_csk_compat_setsockopt(sk, level, optname,
3084 optval, optlen);
3085 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3086}
3087EXPORT_SYMBOL(compat_tcp_setsockopt);
3088#endif
3089
3090static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3091 struct tcp_info *info)
3092{
3093 u64 stats[__TCP_CHRONO_MAX], total = 0;
3094 enum tcp_chrono i;
3095
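	/* chrono_stat[] is zero based while the chrono enum starts at
	 * TCP_CHRONO_BUSY, hence the i - 1 when reading it below.
	 */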
3096 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3097 stats[i] = tp->chrono_stat[i - 1];
3098 if (i == tp->chrono_type)
3099 stats[i] += tcp_jiffies32 - tp->chrono_start;
3100 stats[i] *= USEC_PER_SEC / HZ;
3101 total += stats[i];
3102 }
3103
3104 info->tcpi_busy_time = total;
3105 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3106 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3107}
3108
/* Return information about state of tcp endpoint in API format. */
3110void tcp_get_info(struct sock *sk, struct tcp_info *info)
3111{
3112 const struct tcp_sock *tp = tcp_sk(sk);
3113 const struct inet_connection_sock *icsk = inet_csk(sk);
3114 unsigned long rate;
3115 u32 now;
3116 u64 rate64;
3117 bool slow;
3118
3119 memset(info, 0, sizeof(*info));
3120 if (sk->sk_type != SOCK_STREAM)
3121 return;
3122
3123 info->tcpi_state = inet_sk_state_load(sk);
3124
	/* Report meaningful fields for all TCP states, including listeners */
3126 rate = READ_ONCE(sk->sk_pacing_rate);
3127 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3128 info->tcpi_pacing_rate = rate64;
3129
3130 rate = READ_ONCE(sk->sk_max_pacing_rate);
3131 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3132 info->tcpi_max_pacing_rate = rate64;
3133
3134 info->tcpi_reordering = tp->reordering;
3135 info->tcpi_snd_cwnd = tp->snd_cwnd;
3136
3137 if (info->tcpi_state == TCP_LISTEN) {
		/* listeners aliased fields :
		 * tcpi_unacked -> Number of children ready for accept()
		 * tcpi_sacked  -> max backlog
		 */
3142 info->tcpi_unacked = sk->sk_ack_backlog;
3143 info->tcpi_sacked = sk->sk_max_ack_backlog;
3144 return;
3145 }
3146
3147 slow = lock_sock_fast(sk);
3148
3149 info->tcpi_ca_state = icsk->icsk_ca_state;
3150 info->tcpi_retransmits = icsk->icsk_retransmits;
3151 info->tcpi_probes = icsk->icsk_probes_out;
3152 info->tcpi_backoff = icsk->icsk_backoff;
3153
3154 if (tp->rx_opt.tstamp_ok)
3155 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3156 if (tcp_is_sack(tp))
3157 info->tcpi_options |= TCPI_OPT_SACK;
3158 if (tp->rx_opt.wscale_ok) {
3159 info->tcpi_options |= TCPI_OPT_WSCALE;
3160 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3161 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3162 }
3163
3164 if (tp->ecn_flags & TCP_ECN_OK)
3165 info->tcpi_options |= TCPI_OPT_ECN;
3166 if (tp->ecn_flags & TCP_ECN_SEEN)
3167 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3168 if (tp->syn_data_acked)
3169 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3170
3171 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3172 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3173 info->tcpi_snd_mss = tp->mss_cache;
3174 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3175
3176 info->tcpi_unacked = tp->packets_out;
3177 info->tcpi_sacked = tp->sacked_out;
3178
3179 info->tcpi_lost = tp->lost_out;
3180 info->tcpi_retrans = tp->retrans_out;
3181
3182 now = tcp_jiffies32;
3183 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3184 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3185 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3186
3187 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3188 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3189 info->tcpi_rtt = tp->srtt_us >> 3;
3190 info->tcpi_rttvar = tp->mdev_us >> 2;
3191 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3192 info->tcpi_advmss = tp->advmss;
3193
3194 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3195 info->tcpi_rcv_space = tp->rcvq_space.space;
3196
3197 info->tcpi_total_retrans = tp->total_retrans;
3198
3199 info->tcpi_bytes_acked = tp->bytes_acked;
3200 info->tcpi_bytes_received = tp->bytes_received;
3201 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3202 tcp_get_info_chrono_stats(tp, info);
3203
3204 info->tcpi_segs_out = tp->segs_out;
3205 info->tcpi_segs_in = tp->segs_in;
3206
3207 info->tcpi_min_rtt = tcp_min_rtt(tp);
3208 info->tcpi_data_segs_in = tp->data_segs_in;
3209 info->tcpi_data_segs_out = tp->data_segs_out;
3210
3211 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3212 rate64 = tcp_compute_delivery_rate(tp);
3213 if (rate64)
3214 info->tcpi_delivery_rate = rate64;
3215 info->tcpi_delivered = tp->delivered;
3216 info->tcpi_delivered_ce = tp->delivered_ce;
3217 info->tcpi_bytes_sent = tp->bytes_sent;
3218 info->tcpi_bytes_retrans = tp->bytes_retrans;
3219 info->tcpi_dsack_dups = tp->dsack_dups;
3220 info->tcpi_reord_seen = tp->reord_seen;
3221 unlock_sock_fast(sk, slow);
3222}
3223EXPORT_SYMBOL_GPL(tcp_get_info);
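
/* Illustrative userspace consumer (added) of the snapshot filled in above:
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len);
 */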
3224
3225static size_t tcp_opt_stats_get_size(void)
3226{
3227 return
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
3249 0;
3250}
3251
3252struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3253{
3254 const struct tcp_sock *tp = tcp_sk(sk);
3255 struct sk_buff *stats;
3256 struct tcp_info info;
3257 unsigned long rate;
3258 u64 rate64;
3259
3260 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3261 if (!stats)
3262 return NULL;
3263
3264 tcp_get_info_chrono_stats(tp, &info);
3265 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3266 info.tcpi_busy_time, TCP_NLA_PAD);
3267 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3268 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3269 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3270 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3271 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3272 tp->data_segs_out, TCP_NLA_PAD);
3273 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3274 tp->total_retrans, TCP_NLA_PAD);
3275
3276 rate = READ_ONCE(sk->sk_pacing_rate);
3277 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3278 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3279
3280 rate64 = tcp_compute_delivery_rate(tp);
3281 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3282
3283 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3284 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3285 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3286
3287 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3288 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3289 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3290 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3291 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3292
3293 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3294 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3295
3296 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3297 TCP_NLA_PAD);
3298 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3299 TCP_NLA_PAD);
3300 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3301 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3302
3303 return stats;
3304}
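
/* Note (added): the skb built above rides on transmit timestamp completions
 * when the sender enables SOF_TIMESTAMPING_OPT_STATS; userspace receives it
 * as an SCM_TIMESTAMPING_OPT_STATS control message on the error queue and
 * parses it as the TCP_NLA_* netlink attributes put above.
 */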
3305
3306static int do_tcp_getsockopt(struct sock *sk, int level,
3307 int optname, char __user *optval, int __user *optlen)
3308{
3309 struct inet_connection_sock *icsk = inet_csk(sk);
3310 struct tcp_sock *tp = tcp_sk(sk);
3311 struct net *net = sock_net(sk);
3312 int val, len;
3313
3314 if (get_user(len, optlen))
3315 return -EFAULT;
3316
3317 len = min_t(unsigned int, len, sizeof(int));
3318
3319 if (len < 0)
3320 return -EINVAL;
3321
3322 switch (optname) {
3323 case TCP_MAXSEG:
3324 val = tp->mss_cache;
3325 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3326 val = tp->rx_opt.user_mss;
3327 if (tp->repair)
3328 val = tp->rx_opt.mss_clamp;
3329 break;
3330 case TCP_NODELAY:
3331 val = !!(tp->nonagle&TCP_NAGLE_OFF);
3332 break;
3333 case TCP_CORK:
3334 val = !!(tp->nonagle&TCP_NAGLE_CORK);
3335 break;
3336 case TCP_KEEPIDLE:
3337 val = keepalive_time_when(tp) / HZ;
3338 break;
3339 case TCP_KEEPINTVL:
3340 val = keepalive_intvl_when(tp) / HZ;
3341 break;
3342 case TCP_KEEPCNT:
3343 val = keepalive_probes(tp);
3344 break;
3345 case TCP_SYNCNT:
3346 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3347 break;
3348 case TCP_LINGER2:
3349 val = tp->linger2;
3350 if (val >= 0)
3351 val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3352 break;
3353 case TCP_DEFER_ACCEPT:
3354 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3355 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3356 break;
3357 case TCP_WINDOW_CLAMP:
3358 val = tp->window_clamp;
3359 break;
3360 case TCP_INFO: {
3361 struct tcp_info info;
3362
3363 if (get_user(len, optlen))
3364 return -EFAULT;
3365
3366 tcp_get_info(sk, &info);
3367
3368 len = min_t(unsigned int, len, sizeof(info));
3369 if (put_user(len, optlen))
3370 return -EFAULT;
3371 if (copy_to_user(optval, &info, len))
3372 return -EFAULT;
3373 return 0;
3374 }
3375 case TCP_CC_INFO: {
3376 const struct tcp_congestion_ops *ca_ops;
3377 union tcp_cc_info info;
3378 size_t sz = 0;
3379 int attr;
3380
3381 if (get_user(len, optlen))
3382 return -EFAULT;
3383
3384 ca_ops = icsk->icsk_ca_ops;
3385 if (ca_ops && ca_ops->get_info)
3386 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
3387
3388 len = min_t(unsigned int, len, sz);
3389 if (put_user(len, optlen))
3390 return -EFAULT;
3391 if (copy_to_user(optval, &info, len))
3392 return -EFAULT;
3393 return 0;
3394 }
3395 case TCP_QUICKACK:
3396 val = !icsk->icsk_ack.pingpong;
3397 break;
3398
3399 case TCP_CONGESTION:
3400 if (get_user(len, optlen))
3401 return -EFAULT;
3402 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
3403 if (put_user(len, optlen))
3404 return -EFAULT;
3405 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
3406 return -EFAULT;
3407 return 0;
3408
3409 case TCP_ULP:
3410 if (get_user(len, optlen))
3411 return -EFAULT;
3412 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
3413 if (!icsk->icsk_ulp_ops) {
3414 if (put_user(0, optlen))
3415 return -EFAULT;
3416 return 0;
3417 }
3418 if (put_user(len, optlen))
3419 return -EFAULT;
3420 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
3421 return -EFAULT;
3422 return 0;
3423
3424 case TCP_FASTOPEN_KEY: {
3425 __u8 key[TCP_FASTOPEN_KEY_LENGTH];
3426 struct tcp_fastopen_context *ctx;
3427
3428 if (get_user(len, optlen))
3429 return -EFAULT;
3430
3431 rcu_read_lock();
3432 ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
3433 if (ctx)
3434 memcpy(key, ctx->key, sizeof(key));
3435 else
3436 len = 0;
3437 rcu_read_unlock();
3438
3439 len = min_t(unsigned int, len, sizeof(key));
3440 if (put_user(len, optlen))
3441 return -EFAULT;
3442 if (copy_to_user(optval, key, len))
3443 return -EFAULT;
3444 return 0;
3445 }
3446 case TCP_THIN_LINEAR_TIMEOUTS:
3447 val = tp->thin_lto;
3448 break;
3449
3450 case TCP_THIN_DUPACK:
3451 val = 0;
3452 break;
3453
3454 case TCP_REPAIR:
3455 val = tp->repair;
3456 break;
3457
3458 case TCP_REPAIR_QUEUE:
3459 if (tp->repair)
3460 val = tp->repair_queue;
3461 else
3462 return -EINVAL;
3463 break;
3464
3465 case TCP_REPAIR_WINDOW: {
3466 struct tcp_repair_window opt;
3467
3468 if (get_user(len, optlen))
3469 return -EFAULT;
3470
3471 if (len != sizeof(opt))
3472 return -EINVAL;
3473
3474 if (!tp->repair)
3475 return -EPERM;
3476
3477 opt.snd_wl1 = tp->snd_wl1;
3478 opt.snd_wnd = tp->snd_wnd;
3479 opt.max_window = tp->max_window;
3480 opt.rcv_wnd = tp->rcv_wnd;
3481 opt.rcv_wup = tp->rcv_wup;
3482
3483 if (copy_to_user(optval, &opt, len))
3484 return -EFAULT;
3485 return 0;
3486 }
3487 case TCP_QUEUE_SEQ:
3488 if (tp->repair_queue == TCP_SEND_QUEUE)
3489 val = tp->write_seq;
3490 else if (tp->repair_queue == TCP_RECV_QUEUE)
3491 val = tp->rcv_nxt;
3492 else
3493 return -EINVAL;
3494 break;
3495
3496 case TCP_USER_TIMEOUT:
3497 val = icsk->icsk_user_timeout;
3498 break;
3499
3500 case TCP_FASTOPEN:
3501 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
3502 break;
3503
3504 case TCP_FASTOPEN_CONNECT:
3505 val = tp->fastopen_connect;
3506 break;
3507
3508 case TCP_FASTOPEN_NO_COOKIE:
3509 val = tp->fastopen_no_cookie;
3510 break;
3511
3512 case TCP_TIMESTAMP:
3513 val = tcp_time_stamp_raw() + tp->tsoffset;
3514 break;
3515 case TCP_NOTSENT_LOWAT:
3516 val = tp->notsent_lowat;
3517 break;
3518 case TCP_INQ:
3519 val = tp->recvmsg_inq;
3520 break;
3521 case TCP_SAVE_SYN:
3522 val = tp->save_syn;
3523 break;
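
	/* Layout note (added): tp->saved_syn[0] holds the length of the saved
	 * headers, and the header bytes themselves start at saved_syn + 1;
	 * the copies below rely on that layout.
	 */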
3524 case TCP_SAVED_SYN: {
3525 if (get_user(len, optlen))
3526 return -EFAULT;
3527
3528 lock_sock(sk);
3529 if (tp->saved_syn) {
3530 if (len < tp->saved_syn[0]) {
3531 if (put_user(tp->saved_syn[0], optlen)) {
3532 release_sock(sk);
3533 return -EFAULT;
3534 }
3535 release_sock(sk);
3536 return -EINVAL;
3537 }
3538 len = tp->saved_syn[0];
3539 if (put_user(len, optlen)) {
3540 release_sock(sk);
3541 return -EFAULT;
3542 }
3543 if (copy_to_user(optval, tp->saved_syn + 1, len)) {
3544 release_sock(sk);
3545 return -EFAULT;
3546 }
3547 tcp_saved_syn_free(tp);
3548 release_sock(sk);
3549 } else {
3550 release_sock(sk);
3551 len = 0;
3552 if (put_user(len, optlen))
3553 return -EFAULT;
3554 }
3555 return 0;
3556 }
3557#ifdef CONFIG_MMU
3558 case TCP_ZEROCOPY_RECEIVE: {
3559 struct tcp_zerocopy_receive zc;
3560 int err;
3561
3562 if (get_user(len, optlen))
3563 return -EFAULT;
3564 if (len != sizeof(zc))
3565 return -EINVAL;
3566 if (copy_from_user(&zc, optval, len))
3567 return -EFAULT;
3568 lock_sock(sk);
3569 err = tcp_zerocopy_receive(sk, &zc);
3570 release_sock(sk);
3571 if (!err && copy_to_user(optval, &zc, len))
3572 err = -EFAULT;
3573 return err;
3574 }
3575#endif
3576 default:
3577 return -ENOPROTOOPT;
3578 }
3579
3580 if (put_user(len, optlen))
3581 return -EFAULT;
3582 if (copy_to_user(optval, &val, len))
3583 return -EFAULT;
3584 return 0;
3585}
3586
3587int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
3588 int __user *optlen)
3589{
3590 struct inet_connection_sock *icsk = inet_csk(sk);
3591
3592 if (level != SOL_TCP)
3593 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
3594 optval, optlen);
3595 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3596}
3597EXPORT_SYMBOL(tcp_getsockopt);
3598
3599#ifdef CONFIG_COMPAT
3600int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
3601 char __user *optval, int __user *optlen)
3602{
3603 if (level != SOL_TCP)
3604 return inet_csk_compat_getsockopt(sk, level, optname,
3605 optval, optlen);
3606 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3607}
3608EXPORT_SYMBOL(compat_tcp_getsockopt);
3609#endif
3610
3611#ifdef CONFIG_TCP_MD5SIG
3612static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
3613static DEFINE_MUTEX(tcp_md5sig_mutex);
3614static bool tcp_md5sig_pool_populated = false;
3615
3616static void __tcp_alloc_md5sig_pool(void)
3617{
3618 struct crypto_ahash *hash;
3619 int cpu;
3620
3621 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
3622 if (IS_ERR(hash))
3623 return;
3624
3625 for_each_possible_cpu(cpu) {
3626 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
3627 struct ahash_request *req;
3628
3629 if (!scratch) {
3630 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
3631 sizeof(struct tcphdr),
3632 GFP_KERNEL,
3633 cpu_to_node(cpu));
3634 if (!scratch)
3635 return;
3636 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
3637 }
3638 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
3639 continue;
3640
3641 req = ahash_request_alloc(hash, GFP_KERNEL);
3642 if (!req)
3643 return;
3644
3645 ahash_request_set_callback(req, 0, NULL, NULL);
3646
3647 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
3648 }
	/* before setting tcp_md5sig_pool_populated, we must commit all writes
	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
	 */
3652 smp_wmb();
3653 tcp_md5sig_pool_populated = true;
3654}
3655
3656bool tcp_alloc_md5sig_pool(void)
3657{
3658 if (unlikely(!tcp_md5sig_pool_populated)) {
3659 mutex_lock(&tcp_md5sig_mutex);
3660
3661 if (!tcp_md5sig_pool_populated)
3662 __tcp_alloc_md5sig_pool();
3663
3664 mutex_unlock(&tcp_md5sig_mutex);
3665 }
3666 return tcp_md5sig_pool_populated;
3667}
3668EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3669
/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	We use percpu structure, so if we succeed, we exit with preemption
 *	and BH disabled, to make sure another thread or softirq handling
 *	will try to allocate a new hashing context.
 */
3678struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3679{
3680 local_bh_disable();
3681
3682 if (tcp_md5sig_pool_populated) {
		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
3684 smp_rmb();
3685 return this_cpu_ptr(&tcp_md5sig_pool);
3686 }
3687 local_bh_enable();
3688 return NULL;
3689}
3690EXPORT_SYMBOL(tcp_get_md5sig_pool);
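
/* Note (added): callers pair this with tcp_put_md5sig_pool(), which simply
 * re-enables BH once hashing is finished.
 */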
3691
3692int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3693 const struct sk_buff *skb, unsigned int header_len)
3694{
3695 struct scatterlist sg;
3696 const struct tcphdr *tp = tcp_hdr(skb);
3697 struct ahash_request *req = hp->md5_req;
3698 unsigned int i;
3699 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3700 skb_headlen(skb) - header_len : 0;
3701 const struct skb_shared_info *shi = skb_shinfo(skb);
3702 struct sk_buff *frag_iter;
3703
3704 sg_init_table(&sg, 1);
3705
3706 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3707 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
3708 if (crypto_ahash_update(req))
3709 return 1;
3710
3711 for (i = 0; i < shi->nr_frags; ++i) {
3712 const struct skb_frag_struct *f = &shi->frags[i];
3713 unsigned int offset = f->page_offset;
3714 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3715
3716 sg_set_page(&sg, page, skb_frag_size(f),
3717 offset_in_page(offset));
3718 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
3719 if (crypto_ahash_update(req))
3720 return 1;
3721 }
3722
3723 skb_walk_frags(skb, frag_iter)
3724 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3725 return 1;
3726
3727 return 0;
3728}
3729EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3730
3731int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3732{
3733 struct scatterlist sg;
3734
3735 sg_init_one(&sg, key->key, key->keylen);
3736 ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
3737 return crypto_ahash_update(hp->md5_req);
3738}
3739EXPORT_SYMBOL(tcp_md5_hash_key);
3740
3741#endif
3742
3743void tcp_done(struct sock *sk)
3744{
3745 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3746
3747 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3748 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3749
3750 tcp_set_state(sk, TCP_CLOSE);
3751 tcp_clear_xmit_timers(sk);
3752 if (req)
3753 reqsk_fastopen_remove(sk, req, false);
3754
3755 sk->sk_shutdown = SHUTDOWN_MASK;
3756
3757 if (!sock_flag(sk, SOCK_DEAD))
3758 sk->sk_state_change(sk);
3759 else
3760 inet_csk_destroy_sock(sk);
3761}
3762EXPORT_SYMBOL_GPL(tcp_done);
3763
3764int tcp_abort(struct sock *sk, int err)
3765{
3766 if (!sk_fullsock(sk)) {
3767 if (sk->sk_state == TCP_NEW_SYN_RECV) {
3768 struct request_sock *req = inet_reqsk(sk);
3769
3770 local_bh_disable();
3771 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
3772 local_bh_enable();
3773 return 0;
3774 }
3775 return -EOPNOTSUPP;
3776 }
3777
	/* Don't race with userspace socket closes such as tcp_close. */
3779 lock_sock(sk);
3780
3781 if (sk->sk_state == TCP_LISTEN) {
3782 tcp_set_state(sk, TCP_CLOSE);
3783 inet_csk_listen_stop(sk);
3784 }
3785
	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
3787 local_bh_disable();
3788 bh_lock_sock(sk);
3789
3790 if (!sock_flag(sk, SOCK_DEAD)) {
3791 sk->sk_err = err;
		/* This barrier is coupled with smp_rmb() in tcp_poll() */
3793 smp_wmb();
3794 sk->sk_error_report(sk);
3795 if (tcp_need_reset(sk->sk_state))
3796 tcp_send_active_reset(sk, GFP_ATOMIC);
3797 tcp_done(sk);
3798 }
3799
3800 bh_unlock_sock(sk);
3801 local_bh_enable();
3802 tcp_write_queue_purge(sk);
3803 release_sock(sk);
3804 return 0;
3805}
3806EXPORT_SYMBOL_GPL(tcp_abort);
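
/* Note (added): tcp_abort() is reached through the inet_diag SOCK_DESTROY
 * command (e.g. "ss -K" from iproute2) when CONFIG_INET_DIAG_DESTROY is
 * enabled.
 */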
3807
3808extern struct tcp_congestion_ops tcp_reno;
3809
3810static __initdata unsigned long thash_entries;
3811static int __init set_thash_entries(char *str)
3812{
3813 ssize_t ret;
3814
3815 if (!str)
3816 return 0;
3817
3818 ret = kstrtoul(str, 0, &thash_entries);
3819 if (ret)
3820 return 0;
3821
3822 return 1;
3823}
3824__setup("thash_entries=", set_thash_entries);
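
/* Note (added): the established-hash size can be pinned from the kernel
 * command line, e.g. booting with "thash_entries=131072"; the default of 0
 * lets alloc_large_system_hash() size the table from available memory.
 */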
3825
3826static void __init tcp_init_mem(void)
3827{
3828 unsigned long limit = nr_free_buffer_pages() / 16;
3829
3830 limit = max(limit, 128UL);
3831 sysctl_tcp_mem[0] = limit / 4 * 3;
3832 sysctl_tcp_mem[1] = limit;
3833 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
3834}
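
/* Worked example (added): with nr_free_buffer_pages() == 16384, limit is
 * 1024 pages and sysctl_tcp_mem becomes { 768, 1024, 1536 } pages, i.e.
 * memory pressure starts at 6.25% of buffer memory and the hard limit sits
 * at ~9.4%.
 */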
3835
3836void __init tcp_init(void)
3837{
3838 int max_rshare, max_wshare, cnt;
3839 unsigned long limit;
3840 unsigned int i;
3841
3842 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3843 FIELD_SIZEOF(struct sk_buff, cb));
3844
3845 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3846 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3847 inet_hashinfo_init(&tcp_hashinfo);
3848 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
3849 thash_entries, 21,
3850 0, 64 * 1024);
3851 tcp_hashinfo.bind_bucket_cachep =
3852 kmem_cache_create("tcp_bind_bucket",
3853 sizeof(struct inet_bind_bucket), 0,
3854 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3855
	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
3861 tcp_hashinfo.ehash =
3862 alloc_large_system_hash("TCP established",
3863 sizeof(struct inet_ehash_bucket),
3864 thash_entries,
3865 17,
3866 0,
3867 NULL,
3868 &tcp_hashinfo.ehash_mask,
3869 0,
3870 thash_entries ? 0 : 512 * 1024);
3871 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
3872 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3873
3874 if (inet_ehash_locks_alloc(&tcp_hashinfo))
3875 panic("TCP: failed to alloc ehash_locks");
3876 tcp_hashinfo.bhash =
3877 alloc_large_system_hash("TCP bind",
3878 sizeof(struct inet_bind_hashbucket),
3879 tcp_hashinfo.ehash_mask + 1,
3880 17,
3881 0,
3882 &tcp_hashinfo.bhash_size,
3883 NULL,
3884 0,
3885 64 * 1024);
3886 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
3887 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3888 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3889 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3890 }
3891
3892
3893 cnt = tcp_hashinfo.ehash_mask + 1;
3894 sysctl_tcp_max_orphans = cnt / 2;
3895
3896 tcp_init_mem();
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
3898 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
3899 max_wshare = min(4UL*1024*1024, limit);
3900 max_rshare = min(6UL*1024*1024, limit);
3901
3902 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3903 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
3904 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
3905
3906 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3907 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
3908 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
3909
3910 pr_info("Hash tables configured (established %u bind %u)\n",
3911 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3912
3913 tcp_v4_init();
3914 tcp_metrics_init();
3915 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
3916 tcp_tasklet_init();
3917}
3918