#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated;
EXPORT_SYMBOL(tcp_memory_allocated);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
EXPORT_SYMBOL(tcp_rx_skb_cache_key);

DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
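
/*
 * Global TCP memory-pressure flag. tcp_enter_memory_pressure() records the
 * jiffies timestamp at which pressure started (0 means "no pressure", hence
 * the val-- fixup), and tcp_leave_memory_pressure() clears it and accounts
 * the elapsed time in LINUX_MIB_TCPMEMORYPRESSURESCHRONO.
 */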
void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (READ_ONCE(tcp_memory_pressure))
		return;
	val = jiffies;

	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!READ_ONCE(tcp_memory_pressure))
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
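
/*
 * Convert a seconds value into an equivalent number of retransmissions,
 * assuming the usual exponential backoff capped at rto_max, so that
 * secs_to_retrans() and retrans_to_secs() are (approximate) inverses.
 * For example, with timeout = 1s and rto_max = 120s the cumulative wait
 * after N retransmissions is 1 + 2 + 4 + ... seconds, each term capped
 * at 120s.
 */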
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
	u32 rate = READ_ONCE(tp->rate_delivered);
	u32 intv = READ_ONCE(tp->rate_interval_us);
	u64 rate64 = 0;

	if (rate && intv) {
		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
		do_div(rate64, intv);
	}
	return rate64;
}
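
/*
 * Address-family independent initialization for a tcp_sock: RTO and delayed
 * ACK defaults, an initial congestion window of TCP_INIT_CWND, an
 * effectively infinite ssthresh, and the sysctl-provided send/receive
 * buffer sizes. Called from the per-family init_sock handlers.
 */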
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	icsk->icsk_rto_min = TCP_RTO_MIN;
	icsk->icsk_delack_max = TCP_DELACK_MAX;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	tp->snd_cwnd = TCP_INIT_CWND;

	tp->app_limited = ~0U;

	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
	WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);

	sk_sockets_allocated_inc(sk);
	sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);
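
/*
 * If the caller asked for TX timestamps, mark the skb at the tail of the
 * write queue so its last byte (seq + len - 1) carries the timestamp key.
 */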
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack = 1;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}
}

static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
					  int target, struct sock *sk)
{
	int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);

	if (avail > 0) {
		if (avail >= target)
			return true;
		if (tcp_rmem_pressure(sk))
			return true;
		if (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss)
			return true;
	}
	if (sk->sk_prot->stream_memory_read)
		return sk->sk_prot->stream_memory_read(sk);
	return false;
}
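
/*
 * Compute the poll()/epoll mask for a TCP socket without taking the socket
 * lock: readiness is derived from lockless reads of the socket state,
 * shutdown bits and sequence counters, so the result is inherently a
 * point-in-time snapshot.
 */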
503__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
504{
505 __poll_t mask;
506 struct sock *sk = sock->sk;
507 const struct tcp_sock *tp = tcp_sk(sk);
508 int state;
509
510 sock_poll_wait(file, sock, wait);
511
512 state = inet_sk_state_load(sk);
513 if (state == TCP_LISTEN)
514 return inet_csk_listen_poll(sk);
515
516
517
518
519
520
521 mask = 0;
550 if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
551 mask |= EPOLLHUP;
552 if (sk->sk_shutdown & RCV_SHUTDOWN)
553 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
554
555
556 if (state != TCP_SYN_SENT &&
557 (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
558 int target = sock_rcvlowat(sk, 0, INT_MAX);
559
560 if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
561 !sock_flag(sk, SOCK_URGINLINE) &&
562 tp->urg_data)
563 target++;
564
565 if (tcp_stream_is_readable(tp, target, sk))
566 mask |= EPOLLIN | EPOLLRDNORM;
567
568 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
569 if (__sk_stream_is_writeable(sk, 1)) {
570 mask |= EPOLLOUT | EPOLLWRNORM;
571 } else {
572 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
573 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
574
575
576
577
578
579
580 smp_mb__after_atomic();
581 if (__sk_stream_is_writeable(sk, 1))
582 mask |= EPOLLOUT | EPOLLWRNORM;
583 }
584 } else
585 mask |= EPOLLOUT | EPOLLWRNORM;
586
587 if (tp->urg_data & TCP_URG_VALID)
588 mask |= EPOLLPRI;
589 } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
590
591
592
593
594 mask |= EPOLLOUT | EPOLLWRNORM;
595 }
596
597 smp_rmb();
598 if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
599 mask |= EPOLLERR;
600
601 return mask;
602}
603EXPORT_SYMBOL(tcp_poll);
604
605int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
606{
607 struct tcp_sock *tp = tcp_sk(sk);
608 int answ;
609 bool slow;
610
611 switch (cmd) {
612 case SIOCINQ:
613 if (sk->sk_state == TCP_LISTEN)
614 return -EINVAL;
615
616 slow = lock_sock_fast(sk);
617 answ = tcp_inq(sk);
618 unlock_sock_fast(sk, slow);
619 break;
620 case SIOCATMARK:
621 answ = tp->urg_data &&
622 READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
623 break;
624 case SIOCOUTQ:
625 if (sk->sk_state == TCP_LISTEN)
626 return -EINVAL;
627
628 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
629 answ = 0;
630 else
631 answ = READ_ONCE(tp->write_seq) - tp->snd_una;
632 break;
633 case SIOCOUTQNSD:
634 if (sk->sk_state == TCP_LISTEN)
635 return -EINVAL;
636
637 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
638 answ = 0;
639 else
640 answ = READ_ONCE(tp->write_seq) -
641 READ_ONCE(tp->snd_nxt);
642 break;
643 default:
644 return -ENOIOCTLCMD;
645 }
646
647 return put_user(answ, (int __user *)arg);
648}
649EXPORT_SYMBOL(tcp_ioctl);
650
651static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
652{
653 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
654 tp->pushed_seq = tp->write_seq;
655}
656
657static inline bool forced_push(const struct tcp_sock *tp)
658{
659 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
660}
661
662static void skb_entail(struct sock *sk, struct sk_buff *skb)
663{
664 struct tcp_sock *tp = tcp_sk(sk);
665 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
666
667 skb->csum = 0;
668 tcb->seq = tcb->end_seq = tp->write_seq;
669 tcb->tcp_flags = TCPHDR_ACK;
670 tcb->sacked = 0;
671 __skb_header_release(skb);
672 tcp_add_write_queue_tail(sk, skb);
673 sk_wmem_queued_add(sk, skb->truesize);
674 sk_mem_charge(sk, skb->truesize);
675 if (tp->nonagle & TCP_NAGLE_PUSH)
676 tp->nonagle &= ~TCP_NAGLE_PUSH;
677
678 tcp_slow_start_after_idle_check(sk);
679}
680
681static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
682{
683 if (flags & MSG_OOB)
684 tp->snd_up = tp->write_seq;
685}
686
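/*
 * Autocorking heuristic: if the tail skb is still below size_goal,
 * autocorking is enabled, the rtx queue is not empty and older data is
 * still held by the socket (sk_wmem_alloc above this skb's truesize),
 * tcp_push() defers the transmit so a later write can grow the skb; the
 * TSQ throttled bit makes sure a flush happens once prior skbs are freed.
 */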
697static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
698 int size_goal)
699{
700 return skb->len < size_goal &&
701 sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
702 !tcp_rtx_queue_empty(sk) &&
703 refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
704}
705
706void tcp_push(struct sock *sk, int flags, int mss_now,
707 int nonagle, int size_goal)
708{
709 struct tcp_sock *tp = tcp_sk(sk);
710 struct sk_buff *skb;
711
712 skb = tcp_write_queue_tail(sk);
713 if (!skb)
714 return;
715 if (!(flags & MSG_MORE) || forced_push(tp))
716 tcp_mark_push(tp, skb);
717
718 tcp_mark_urg(tp, flags);
719
720 if (tcp_should_autocork(sk, skb, size_goal)) {
721
722
723 if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
724 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
725 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
726 }
727
728
729
730 if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
731 return;
732 }
733
734 if (flags & MSG_MORE)
735 nonagle = TCP_NAGLE_CORK;
736
737 __tcp_push_pending_frames(sk, mss_now, nonagle);
738}
739
740static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
741 unsigned int offset, size_t len)
742{
743 struct tcp_splice_state *tss = rd_desc->arg.data;
744 int ret;
745
746 ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
747 min(rd_desc->count, len), tss->flags);
748 if (ret > 0)
749 rd_desc->count -= ret;
750 return ret;
751}
752
753static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
754{
755
756 read_descriptor_t rd_desc = {
757 .arg.data = tss,
758 .count = tss->len,
759 };
760
761 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
762}
763
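/*
 * Splice received data from a TCP socket into a pipe. Returns the number
 * of bytes spliced, or a negative error if nothing was spliced; like other
 * receive paths it drops and re-takes the socket lock between chunks so
 * that backlog processing can make progress.
 */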
776ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
777 struct pipe_inode_info *pipe, size_t len,
778 unsigned int flags)
779{
780 struct sock *sk = sock->sk;
781 struct tcp_splice_state tss = {
782 .pipe = pipe,
783 .len = len,
784 .flags = flags,
785 };
786 long timeo;
787 ssize_t spliced;
788 int ret;
789
790 sock_rps_record_flow(sk);
791
792
793
794 if (unlikely(*ppos))
795 return -ESPIPE;
796
797 ret = spliced = 0;
798
799 lock_sock(sk);
800
801 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
802 while (tss.len) {
803 ret = __tcp_splice_read(sk, &tss);
804 if (ret < 0)
805 break;
806 else if (!ret) {
807 if (spliced)
808 break;
809 if (sock_flag(sk, SOCK_DONE))
810 break;
811 if (sk->sk_err) {
812 ret = sock_error(sk);
813 break;
814 }
815 if (sk->sk_shutdown & RCV_SHUTDOWN)
816 break;
817 if (sk->sk_state == TCP_CLOSE) {
818
819
820
821
822 ret = -ENOTCONN;
823 break;
824 }
825 if (!timeo) {
826 ret = -EAGAIN;
827 break;
828 }
829
830
831
832
833 if (!skb_queue_empty(&sk->sk_receive_queue))
834 break;
835 sk_wait_data(sk, &timeo, NULL);
836 if (signal_pending(current)) {
837 ret = sock_intr_errno(timeo);
838 break;
839 }
840 continue;
841 }
842 tss.len -= ret;
843 spliced += ret;
844
845 if (!timeo)
846 break;
847 release_sock(sk);
848 lock_sock(sk);
849
850 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
851 (sk->sk_shutdown & RCV_SHUTDOWN) ||
852 signal_pending(current))
853 break;
854 }
855
856 release_sock(sk);
857
858 if (spliced)
859 return spliced;
860
861 return ret;
862}
863EXPORT_SYMBOL(tcp_splice_read);
864
865struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
866 bool force_schedule)
867{
868 struct sk_buff *skb;
869
870 if (likely(!size)) {
871 skb = sk->sk_tx_skb_cache;
872 if (skb) {
873 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
874 sk->sk_tx_skb_cache = NULL;
875 pskb_trim(skb, 0);
876 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
877 skb_shinfo(skb)->tx_flags = 0;
878 memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
879 return skb;
880 }
881 }
882
883 size = ALIGN(size, 4);
884
885 if (unlikely(tcp_under_memory_pressure(sk)))
886 sk_mem_reclaim_partial(sk);
887
888 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
889 if (likely(skb)) {
890 bool mem_scheduled;
891
892 if (force_schedule) {
893 mem_scheduled = true;
894 sk_forced_mem_schedule(sk, skb->truesize);
895 } else {
896 mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
897 }
898 if (likely(mem_scheduled)) {
899 skb_reserve(skb, sk->sk_prot->max_header);
900
901
902
903
904 skb->reserved_tailroom = skb->end - skb->tail - size;
905 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
906 return skb;
907 }
908 __kfree_skb(skb);
909 } else {
910 sk->sk_prot->enter_memory_pressure(sk);
911 sk_stream_moderate_sndbuf(sk);
912 }
913 return NULL;
914}
915
916static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
917 int large_allowed)
918{
919 struct tcp_sock *tp = tcp_sk(sk);
920 u32 new_size_goal, size_goal;
921
922 if (!large_allowed)
923 return mss_now;
924
925
926 new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
927 new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
928
929
930 size_goal = tp->gso_segs * mss_now;
931 if (unlikely(new_size_goal < size_goal ||
932 new_size_goal >= size_goal + mss_now)) {
933 tp->gso_segs = min_t(u16, new_size_goal / mss_now,
934 sk->sk_gso_max_segs);
935 size_goal = tp->gso_segs * mss_now;
936 }
937
938 return max(size_goal, mss_now);
939}
940
941int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
942{
943 int mss_now;
944
945 mss_now = tcp_current_mss(sk);
946 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
947
948 return mss_now;
949}
950
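/*
 * Drop an skb that ended up with no payload (e.g. after a copy fault)
 * from the write queue, so it is never transmitted.
 */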
957static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
958{
959 if (skb && !skb->len) {
960 tcp_unlink_write_queue(skb, sk);
961 if (tcp_write_queue_empty(sk))
962 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
963 sk_wmem_free_skb(sk, skb);
964 }
965}
966
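/*
 * Sendpage path: attach page fragments to write-queue skbs (coalescing
 * with the previous fragment when possible) instead of copying the
 * payload, then push according to the usual Nagle/autocork rules.
 */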
967ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
968 size_t size, int flags)
969{
970 struct tcp_sock *tp = tcp_sk(sk);
971 int mss_now, size_goal;
972 int err;
973 ssize_t copied;
974 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
975
976 if (IS_ENABLED(CONFIG_DEBUG_VM) &&
977 WARN_ONCE(!sendpage_ok(page),
978 "page must not be a Slab one and have page_count > 0"))
979 return -EINVAL;
980
981
982
983
984
985 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
986 !tcp_passive_fastopen(sk)) {
987 err = sk_stream_wait_connect(sk, &timeo);
988 if (err != 0)
989 goto out_err;
990 }
991
992 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
993
994 mss_now = tcp_send_mss(sk, &size_goal, flags);
995 copied = 0;
996
997 err = -EPIPE;
998 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
999 goto out_err;
1000
1001 while (size > 0) {
1002 struct sk_buff *skb = tcp_write_queue_tail(sk);
1003 int copy, i;
1004 bool can_coalesce;
1005
1006 if (!skb || (copy = size_goal - skb->len) <= 0 ||
1007 !tcp_skb_can_collapse_to(skb)) {
1008new_segment:
1009 if (!sk_stream_memory_free(sk))
1010 goto wait_for_space;
1011
1012 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
1013 tcp_rtx_and_write_queues_empty(sk));
1014 if (!skb)
1015 goto wait_for_space;
1016
1017#ifdef CONFIG_TLS_DEVICE
1018 skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
1019#endif
1020 skb_entail(sk, skb);
1021 copy = size_goal;
1022 }
1023
1024 if (copy > size)
1025 copy = size;
1026
1027 i = skb_shinfo(skb)->nr_frags;
1028 can_coalesce = skb_can_coalesce(skb, i, page, offset);
1029 if (!can_coalesce && i >= sysctl_max_skb_frags) {
1030 tcp_mark_push(tp, skb);
1031 goto new_segment;
1032 }
1033 if (!sk_wmem_schedule(sk, copy))
1034 goto wait_for_space;
1035
1036 if (can_coalesce) {
1037 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1038 } else {
1039 get_page(page);
1040 skb_fill_page_desc(skb, i, page, offset, copy);
1041 }
1042
1043 if (!(flags & MSG_NO_SHARED_FRAGS))
1044 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
1045
1046 skb->len += copy;
1047 skb->data_len += copy;
1048 skb->truesize += copy;
1049 sk_wmem_queued_add(sk, copy);
1050 sk_mem_charge(sk, copy);
1051 skb->ip_summed = CHECKSUM_PARTIAL;
1052 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1053 TCP_SKB_CB(skb)->end_seq += copy;
1054 tcp_skb_pcount_set(skb, 0);
1055
1056 if (!copied)
1057 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1058
1059 copied += copy;
1060 offset += copy;
1061 size -= copy;
1062 if (!size)
1063 goto out;
1064
1065 if (skb->len < size_goal || (flags & MSG_OOB))
1066 continue;
1067
1068 if (forced_push(tp)) {
1069 tcp_mark_push(tp, skb);
1070 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1071 } else if (skb == tcp_send_head(sk))
1072 tcp_push_one(sk, mss_now);
1073 continue;
1074
1075wait_for_space:
1076 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1077 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1078 TCP_NAGLE_PUSH, size_goal);
1079
1080 err = sk_stream_wait_memory(sk, &timeo);
1081 if (err != 0)
1082 goto do_error;
1083
1084 mss_now = tcp_send_mss(sk, &size_goal, flags);
1085 }
1086
1087out:
1088 if (copied) {
1089 tcp_tx_timestamp(sk, sk->sk_tsflags);
1090 if (!(flags & MSG_SENDPAGE_NOTLAST))
1091 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1092 }
1093 return copied;
1094
1095do_error:
1096 tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
1097 if (copied)
1098 goto out;
1099out_err:
1100
1101 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1102 sk->sk_write_space(sk);
1103 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1104 }
1105 return sk_stream_error(sk, flags, err);
1106}
1107EXPORT_SYMBOL_GPL(do_tcp_sendpages);
1108
1109int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
1110 size_t size, int flags)
1111{
1112 if (!(sk->sk_route_caps & NETIF_F_SG))
1113 return sock_no_sendpage_locked(sk, page, offset, size, flags);
1114
1115 tcp_rate_check_app_limited(sk);
1116
1117 return do_tcp_sendpages(sk, page, offset, size, flags);
1118}
1119EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
1120
1121int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1122 size_t size, int flags)
1123{
1124 int ret;
1125
1126 lock_sock(sk);
1127 ret = tcp_sendpage_locked(sk, page, offset, size, flags);
1128 release_sock(sk);
1129
1130 return ret;
1131}
1132EXPORT_SYMBOL(tcp_sendpage);
1133
1134void tcp_free_fastopen_req(struct tcp_sock *tp)
1135{
1136 if (tp->fastopen_req) {
1137 kfree(tp->fastopen_req);
1138 tp->fastopen_req = NULL;
1139 }
1140}
1141
1142static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1143 int *copied, size_t size,
1144 struct ubuf_info *uarg)
1145{
1146 struct tcp_sock *tp = tcp_sk(sk);
1147 struct inet_sock *inet = inet_sk(sk);
1148 struct sockaddr *uaddr = msg->msg_name;
1149 int err, flags;
1150
1151 if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
1152 (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1153 uaddr->sa_family == AF_UNSPEC))
1154 return -EOPNOTSUPP;
1155 if (tp->fastopen_req)
1156 return -EALREADY;
1157
1158 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1159 sk->sk_allocation);
1160 if (unlikely(!tp->fastopen_req))
1161 return -ENOBUFS;
1162 tp->fastopen_req->data = msg;
1163 tp->fastopen_req->size = size;
1164 tp->fastopen_req->uarg = uarg;
1165
1166 if (inet->defer_connect) {
1167 err = tcp_connect(sk);
1168
1169 if (err) {
1170 tcp_set_state(sk, TCP_CLOSE);
1171 inet->inet_dport = 0;
1172 sk->sk_route_caps = 0;
1173 }
1174 }
1175 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1176 err = __inet_stream_connect(sk->sk_socket, uaddr,
1177 msg->msg_namelen, flags, 1);
1178
1179
1180
1181 if (tp->fastopen_req) {
1182 *copied = tp->fastopen_req->copied;
1183 tcp_free_fastopen_req(tp);
1184 inet->defer_connect = 0;
1185 }
1186 return err;
1187}
1188
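/*
 * Main sendmsg path. Data is either appended to the tail skb, copied into
 * page fragments, or (with MSG_ZEROCOPY on a SOCK_ZEROCOPY socket) pinned
 * via skb_zerocopy_iter_stream(); MSG_FASTOPEN and defer_connect are
 * handled up front by tcp_sendmsg_fastopen().
 */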
1189int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1190{
1191 struct tcp_sock *tp = tcp_sk(sk);
1192 struct ubuf_info *uarg = NULL;
1193 struct sk_buff *skb;
1194 struct sockcm_cookie sockc;
1195 int flags, err, copied = 0;
1196 int mss_now = 0, size_goal, copied_syn = 0;
1197 int process_backlog = 0;
1198 bool zc = false;
1199 long timeo;
1200
1201 flags = msg->msg_flags;
1202
1203 if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
1204 skb = tcp_write_queue_tail(sk);
1205 uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
1206 if (!uarg) {
1207 err = -ENOBUFS;
1208 goto out_err;
1209 }
1210
1211 zc = sk->sk_route_caps & NETIF_F_SG;
1212 if (!zc)
1213 uarg->zerocopy = 0;
1214 }
1215
1216 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
1217 !tp->repair) {
1218 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
1219 if (err == -EINPROGRESS && copied_syn > 0)
1220 goto out;
1221 else if (err)
1222 goto out_err;
1223 }
1224
1225 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1226
1227 tcp_rate_check_app_limited(sk);
1228
1229
1230
1231
1232
1233 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1234 !tcp_passive_fastopen(sk)) {
1235 err = sk_stream_wait_connect(sk, &timeo);
1236 if (err != 0)
1237 goto do_error;
1238 }
1239
1240 if (unlikely(tp->repair)) {
1241 if (tp->repair_queue == TCP_RECV_QUEUE) {
1242 copied = tcp_send_rcvq(sk, msg, size);
1243 goto out_nopush;
1244 }
1245
1246 err = -EINVAL;
1247 if (tp->repair_queue == TCP_NO_QUEUE)
1248 goto out_err;
1249
1250
1251 }
1252
1253 sockcm_init(&sockc, sk);
1254 if (msg->msg_controllen) {
1255 err = sock_cmsg_send(sk, msg, &sockc);
1256 if (unlikely(err)) {
1257 err = -EINVAL;
1258 goto out_err;
1259 }
1260 }
1261
1262
1263 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1264
1265
1266 copied = 0;
1267
1268restart:
1269 mss_now = tcp_send_mss(sk, &size_goal, flags);
1270
1271 err = -EPIPE;
1272 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1273 goto do_error;
1274
1275 while (msg_data_left(msg)) {
1276 int copy = 0;
1277
1278 skb = tcp_write_queue_tail(sk);
1279 if (skb)
1280 copy = size_goal - skb->len;
1281
1282 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1283 bool first_skb;
1284
1285new_segment:
1286 if (!sk_stream_memory_free(sk))
1287 goto wait_for_space;
1288
1289 if (unlikely(process_backlog >= 16)) {
1290 process_backlog = 0;
1291 if (sk_flush_backlog(sk))
1292 goto restart;
1293 }
1294 first_skb = tcp_rtx_and_write_queues_empty(sk);
1295 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
1296 first_skb);
1297 if (!skb)
1298 goto wait_for_space;
1299
1300 process_backlog++;
1301 skb->ip_summed = CHECKSUM_PARTIAL;
1302
1303 skb_entail(sk, skb);
1304 copy = size_goal;
1305
1306
1307
1308
1309
1310 if (tp->repair)
1311 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1312 }
1313
1314
1315 if (copy > msg_data_left(msg))
1316 copy = msg_data_left(msg);
1317
1318
1319 if (skb_availroom(skb) > 0 && !zc) {
1320
1321 copy = min_t(int, copy, skb_availroom(skb));
1322 err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1323 if (err)
1324 goto do_fault;
1325 } else if (!zc) {
1326 bool merge = true;
1327 int i = skb_shinfo(skb)->nr_frags;
1328 struct page_frag *pfrag = sk_page_frag(sk);
1329
1330 if (!sk_page_frag_refill(sk, pfrag))
1331 goto wait_for_space;
1332
1333 if (!skb_can_coalesce(skb, i, pfrag->page,
1334 pfrag->offset)) {
1335 if (i >= sysctl_max_skb_frags) {
1336 tcp_mark_push(tp, skb);
1337 goto new_segment;
1338 }
1339 merge = false;
1340 }
1341
1342 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1343
1344 if (!sk_wmem_schedule(sk, copy))
1345 goto wait_for_space;
1346
1347 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1348 pfrag->page,
1349 pfrag->offset,
1350 copy);
1351 if (err)
1352 goto do_error;
1353
1354
1355 if (merge) {
1356 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1357 } else {
1358 skb_fill_page_desc(skb, i, pfrag->page,
1359 pfrag->offset, copy);
1360 page_ref_inc(pfrag->page);
1361 }
1362 pfrag->offset += copy;
1363 } else {
1364 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
1365 if (err == -EMSGSIZE || err == -EEXIST) {
1366 tcp_mark_push(tp, skb);
1367 goto new_segment;
1368 }
1369 if (err < 0)
1370 goto do_error;
1371 copy = err;
1372 }
1373
1374 if (!copied)
1375 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1376
1377 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1378 TCP_SKB_CB(skb)->end_seq += copy;
1379 tcp_skb_pcount_set(skb, 0);
1380
1381 copied += copy;
1382 if (!msg_data_left(msg)) {
1383 if (unlikely(flags & MSG_EOR))
1384 TCP_SKB_CB(skb)->eor = 1;
1385 goto out;
1386 }
1387
1388 if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
1389 continue;
1390
1391 if (forced_push(tp)) {
1392 tcp_mark_push(tp, skb);
1393 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1394 } else if (skb == tcp_send_head(sk))
1395 tcp_push_one(sk, mss_now);
1396 continue;
1397
1398wait_for_space:
1399 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1400 if (copied)
1401 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1402 TCP_NAGLE_PUSH, size_goal);
1403
1404 err = sk_stream_wait_memory(sk, &timeo);
1405 if (err != 0)
1406 goto do_error;
1407
1408 mss_now = tcp_send_mss(sk, &size_goal, flags);
1409 }
1410
1411out:
1412 if (copied) {
1413 tcp_tx_timestamp(sk, sockc.tsflags);
1414 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1415 }
1416out_nopush:
1417 sock_zerocopy_put(uarg);
1418 return copied + copied_syn;
1419
1420do_error:
1421 skb = tcp_write_queue_tail(sk);
1422do_fault:
1423 tcp_remove_empty_skb(sk, skb);
1424
1425 if (copied + copied_syn)
1426 goto out;
1427out_err:
1428 sock_zerocopy_put_abort(uarg, true);
1429 err = sk_stream_error(sk, flags, err);
1430
1431 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1432 sk->sk_write_space(sk);
1433 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1434 }
1435 return err;
1436}
1437EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1438
1439int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1440{
1441 int ret;
1442
1443 lock_sock(sk);
1444 ret = tcp_sendmsg_locked(sk, msg, size);
1445 release_sock(sk);
1446
1447 return ret;
1448}
1449EXPORT_SYMBOL(tcp_sendmsg);
1450
1451
1452
1453
1454
1455
1456static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1457{
1458 struct tcp_sock *tp = tcp_sk(sk);
1459
1460
1461 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1462 tp->urg_data == TCP_URG_READ)
1463 return -EINVAL;
1464
1465 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1466 return -ENOTCONN;
1467
1468 if (tp->urg_data & TCP_URG_VALID) {
1469 int err = 0;
1470 char c = tp->urg_data;
1471
1472 if (!(flags & MSG_PEEK))
1473 tp->urg_data = TCP_URG_READ;
1474
1475
1476 msg->msg_flags |= MSG_OOB;
1477
1478 if (len > 0) {
1479 if (!(flags & MSG_TRUNC))
1480 err = memcpy_to_msg(msg, &c, 1);
1481 len = 1;
1482 } else
1483 msg->msg_flags |= MSG_TRUNC;
1484
1485 return err ? -EFAULT : len;
1486 }
1487
1488 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1489 return 0;
1490
1491
1492
1493
1494
1495
1496
1497 return -EAGAIN;
1498}
1499
1500static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1501{
1502 struct sk_buff *skb;
1503 int copied = 0, err = 0;
1504
1505
1506
1507 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1508 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1509 if (err)
1510 return err;
1511 copied += skb->len;
1512 }
1513
1514 skb_queue_walk(&sk->sk_write_queue, skb) {
1515 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1516 if (err)
1517 break;
1518
1519 copied += skb->len;
1520 }
1521
1522 return err ?: copied;
1523}
1524
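/*
 * Called after data has been copied to user space: decide whether the
 * freed receive space warrants an immediate ACK, either because a
 * scheduled ACK is overdue or because the advertised window can now at
 * least double.
 */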
1531void tcp_cleanup_rbuf(struct sock *sk, int copied)
1532{
1533 struct tcp_sock *tp = tcp_sk(sk);
1534 bool time_to_ack = false;
1535
1536 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1537
1538 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1539 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1540 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1541
1542 if (inet_csk_ack_scheduled(sk)) {
1543 const struct inet_connection_sock *icsk = inet_csk(sk);
1544
1545 if (
1546 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1547
1548
1549
1550
1551
1552
1553 (copied > 0 &&
1554 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1555 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1556 !inet_csk_in_pingpong_mode(sk))) &&
1557 !atomic_read(&sk->sk_rmem_alloc)))
1558 time_to_ack = true;
1559 }
1560
1561
1562
1563
1564
1565
1566
1567 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1568 __u32 rcv_window_now = tcp_receive_window(tp);
1569
1570
1571 if (2*rcv_window_now <= tp->window_clamp) {
1572 __u32 new_window = __tcp_select_window(sk);
1573
1574
1575
1576
1577
1578
1579 if (new_window && new_window >= 2 * rcv_window_now)
1580 time_to_ack = true;
1581 }
1582 }
1583 if (time_to_ack)
1584 tcp_send_ack(sk);
1585}
1586
1587static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1588{
1589 struct sk_buff *skb;
1590 u32 offset;
1591
1592 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1593 offset = seq - TCP_SKB_CB(skb)->seq;
1594 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1595 pr_err_once("%s: found a SYN, please report !\n", __func__);
1596 offset--;
1597 }
1598 if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1599 *off = offset;
1600 return skb;
1601 }
1602
1603
1604
1605
1606 sk_eat_skb(sk, skb);
1607 }
1608 return NULL;
1609}
1610
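/*
 * Feed receive-queue data to a callback (recv_actor) without copying it to
 * user space; used by splice and by in-kernel consumers. Consumed bytes
 * advance copied_seq, and fully eaten skbs are freed as we go. Called with
 * the socket lock held.
 */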
1622int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1623 sk_read_actor_t recv_actor)
1624{
1625 struct sk_buff *skb;
1626 struct tcp_sock *tp = tcp_sk(sk);
1627 u32 seq = tp->copied_seq;
1628 u32 offset;
1629 int copied = 0;
1630
1631 if (sk->sk_state == TCP_LISTEN)
1632 return -ENOTCONN;
1633 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1634 if (offset < skb->len) {
1635 int used;
1636 size_t len;
1637
1638 len = skb->len - offset;
1639
1640 if (tp->urg_data) {
1641 u32 urg_offset = tp->urg_seq - seq;
1642 if (urg_offset < len)
1643 len = urg_offset;
1644 if (!len)
1645 break;
1646 }
1647 used = recv_actor(desc, skb, offset, len);
1648 if (used <= 0) {
1649 if (!copied)
1650 copied = used;
1651 break;
1652 } else if (used <= len) {
1653 seq += used;
1654 copied += used;
1655 offset += used;
1656 }
1657
1658
1659
1660
1661
1662 skb = tcp_recv_skb(sk, seq - 1, &offset);
1663 if (!skb)
1664 break;
1665
1666
1667
1668 if (offset + 1 != skb->len)
1669 continue;
1670 }
1671 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1672 sk_eat_skb(sk, skb);
1673 ++seq;
1674 break;
1675 }
1676 sk_eat_skb(sk, skb);
1677 if (!desc->count)
1678 break;
1679 WRITE_ONCE(tp->copied_seq, seq);
1680 }
1681 WRITE_ONCE(tp->copied_seq, seq);
1682
1683 tcp_rcv_space_adjust(sk);
1684
1685
1686 if (copied > 0) {
1687 tcp_recv_skb(sk, seq, &offset);
1688 tcp_cleanup_rbuf(sk, copied);
1689 }
1690 return copied;
1691}
1692EXPORT_SYMBOL(tcp_read_sock);
1693
1694int tcp_peek_len(struct socket *sock)
1695{
1696 return tcp_inq(sock->sk);
1697}
1698EXPORT_SYMBOL(tcp_peek_len);
1699
1700
1701int tcp_set_rcvlowat(struct sock *sk, int val)
1702{
1703 int cap;
1704
1705 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1706 cap = sk->sk_rcvbuf >> 1;
1707 else
1708 cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
1709 val = min(val, cap);
1710 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1711
1712
1713 tcp_data_ready(sk);
1714
1715 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1716 return 0;
1717
1718 val <<= 1;
1719 if (val > sk->sk_rcvbuf) {
1720 WRITE_ONCE(sk->sk_rcvbuf, val);
1721 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1722 }
1723 return 0;
1724}
1725EXPORT_SYMBOL(tcp_set_rcvlowat);
1726
1727#ifdef CONFIG_MMU
1728static const struct vm_operations_struct tcp_vm_ops = {
1729};
1730
1731int tcp_mmap(struct file *file, struct socket *sock,
1732 struct vm_area_struct *vma)
1733{
1734 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1735 return -EPERM;
1736 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1737
1738
1739 vma->vm_flags |= VM_MIXEDMAP;
1740
1741 vma->vm_ops = &tcp_vm_ops;
1742 return 0;
1743}
1744EXPORT_SYMBOL(tcp_mmap);
1745
1746static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
1747 struct page **pages,
1748 unsigned long pages_to_map,
1749 unsigned long *insert_addr,
1750 u32 *length_with_pending,
1751 u32 *seq,
1752 struct tcp_zerocopy_receive *zc)
1753{
1754 unsigned long pages_remaining = pages_to_map;
1755 int bytes_mapped;
1756 int ret;
1757
1758 ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
1759 bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
1760
1761
1762
1763 *seq += bytes_mapped;
1764 *insert_addr += bytes_mapped;
1765 if (ret) {
1766
1767
1768
1769 const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
1770 *length_with_pending -= bytes_not_mapped;
1771 zc->recv_skip_hint += bytes_not_mapped;
1772 }
1773 return ret;
1774}
1775
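/*
 * Receive zerocopy: map page-sized, page-aligned skb fragments straight
 * into the caller's VMA (batched through vm_insert_pages()) instead of
 * copying. Fragments that are not whole pages are reported back through
 * recv_skip_hint so the caller can fall back to a normal recvmsg() for
 * that tail portion.
 */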
1776static int tcp_zerocopy_receive(struct sock *sk,
1777 struct tcp_zerocopy_receive *zc)
1778{
1779 unsigned long address = (unsigned long)zc->address;
1780 u32 length = 0, seq, offset, zap_len;
1781 #define PAGE_BATCH_SIZE 8
1782 struct page *pages[PAGE_BATCH_SIZE];
1783 const skb_frag_t *frags = NULL;
1784 struct vm_area_struct *vma;
1785 struct sk_buff *skb = NULL;
1786 unsigned long pg_idx = 0;
1787 unsigned long curr_addr;
1788 struct tcp_sock *tp;
1789 int inq;
1790 int ret;
1791
1792 if (address & (PAGE_SIZE - 1) || address != zc->address)
1793 return -EINVAL;
1794
1795 if (sk->sk_state == TCP_LISTEN)
1796 return -ENOTCONN;
1797
1798 sock_rps_record_flow(sk);
1799
1800 tp = tcp_sk(sk);
1801
1802 mmap_read_lock(current->mm);
1803
1804 vma = find_vma(current->mm, address);
1805 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
1806 mmap_read_unlock(current->mm);
1807 return -EINVAL;
1808 }
1809 zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
1810
1811 seq = tp->copied_seq;
1812 inq = tcp_inq(sk);
1813 zc->length = min_t(u32, zc->length, inq);
1814 zap_len = zc->length & ~(PAGE_SIZE - 1);
1815 if (zap_len) {
1816 zap_page_range(vma, address, zap_len);
1817 zc->recv_skip_hint = 0;
1818 } else {
1819 zc->recv_skip_hint = zc->length;
1820 }
1821 ret = 0;
1822 curr_addr = address;
1823 while (length + PAGE_SIZE <= zc->length) {
1824 if (zc->recv_skip_hint < PAGE_SIZE) {
1825
1826 if (pg_idx) {
1827 ret = tcp_zerocopy_vm_insert_batch(vma, pages,
1828 pg_idx,
1829 &curr_addr,
1830 &length,
1831 &seq, zc);
1832 if (ret)
1833 goto out;
1834 pg_idx = 0;
1835 }
1836 if (skb) {
1837 if (zc->recv_skip_hint > 0)
1838 break;
1839 skb = skb->next;
1840 offset = seq - TCP_SKB_CB(skb)->seq;
1841 } else {
1842 skb = tcp_recv_skb(sk, seq, &offset);
1843 }
1844 zc->recv_skip_hint = skb->len - offset;
1845 offset -= skb_headlen(skb);
1846 if ((int)offset < 0 || skb_has_frag_list(skb))
1847 break;
1848 frags = skb_shinfo(skb)->frags;
1849 while (offset) {
1850 if (skb_frag_size(frags) > offset)
1851 goto out;
1852 offset -= skb_frag_size(frags);
1853 frags++;
1854 }
1855 }
1856 if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
1857 int remaining = zc->recv_skip_hint;
1858
1859 while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
1860 skb_frag_off(frags))) {
1861 remaining -= skb_frag_size(frags);
1862 frags++;
1863 }
1864 zc->recv_skip_hint -= remaining;
1865 break;
1866 }
1867 pages[pg_idx] = skb_frag_page(frags);
1868 pg_idx++;
1869 length += PAGE_SIZE;
1870 zc->recv_skip_hint -= PAGE_SIZE;
1871 frags++;
1872 if (pg_idx == PAGE_BATCH_SIZE) {
1873 ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
1874 &curr_addr, &length,
1875 &seq, zc);
1876 if (ret)
1877 goto out;
1878 pg_idx = 0;
1879 }
1880 }
1881 if (pg_idx) {
1882 ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
1883 &curr_addr, &length, &seq,
1884 zc);
1885 }
1886out:
1887 mmap_read_unlock(current->mm);
1888 if (length) {
1889 WRITE_ONCE(tp->copied_seq, seq);
1890 tcp_rcv_space_adjust(sk);
1891
1892
1893 tcp_recv_skb(sk, seq, &offset);
1894 tcp_cleanup_rbuf(sk, length);
1895 ret = 0;
1896 if (length == zc->length)
1897 zc->recv_skip_hint = 0;
1898 } else {
1899 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
1900 ret = -EIO;
1901 }
1902 zc->length = length;
1903 return ret;
1904}
1905#endif
1906
1907static void tcp_update_recv_tstamps(struct sk_buff *skb,
1908 struct scm_timestamping_internal *tss)
1909{
1910 if (skb->tstamp)
1911 tss->ts[0] = ktime_to_timespec64(skb->tstamp);
1912 else
1913 tss->ts[0] = (struct timespec64) {0};
1914
1915 if (skb_hwtstamps(skb)->hwtstamp)
1916 tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
1917 else
1918 tss->ts[2] = (struct timespec64) {0};
1919}
1920
1921
1922static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
1923 struct scm_timestamping_internal *tss)
1924{
1925 int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
1926 bool has_timestamping = false;
1927
1928 if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
1929 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
1930 if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
1931 if (new_tstamp) {
1932 struct __kernel_timespec kts = {
1933 .tv_sec = tss->ts[0].tv_sec,
1934 .tv_nsec = tss->ts[0].tv_nsec,
1935 };
1936 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
1937 sizeof(kts), &kts);
1938 } else {
1939 struct __kernel_old_timespec ts_old = {
1940 .tv_sec = tss->ts[0].tv_sec,
1941 .tv_nsec = tss->ts[0].tv_nsec,
1942 };
1943 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
1944 sizeof(ts_old), &ts_old);
1945 }
1946 } else {
1947 if (new_tstamp) {
1948 struct __kernel_sock_timeval stv = {
1949 .tv_sec = tss->ts[0].tv_sec,
1950 .tv_usec = tss->ts[0].tv_nsec / 1000,
1951 };
1952 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
1953 sizeof(stv), &stv);
1954 } else {
1955 struct __kernel_old_timeval tv = {
1956 .tv_sec = tss->ts[0].tv_sec,
1957 .tv_usec = tss->ts[0].tv_nsec / 1000,
1958 };
1959 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
1960 sizeof(tv), &tv);
1961 }
1962 }
1963 }
1964
1965 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
1966 has_timestamping = true;
1967 else
1968 tss->ts[0] = (struct timespec64) {0};
1969 }
1970
1971 if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
1972 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
1973 has_timestamping = true;
1974 else
1975 tss->ts[2] = (struct timespec64) {0};
1976 }
1977
1978 if (has_timestamping) {
1979 tss->ts[1] = (struct timespec64) {0};
1980 if (sock_flag(sk, SOCK_TSTAMP_NEW))
1981 put_cmsg_scm_timestamping64(msg, tss);
1982 else
1983 put_cmsg_scm_timestamping(msg, tss);
1984 }
1985}
1986
1987static int tcp_inq_hint(struct sock *sk)
1988{
1989 const struct tcp_sock *tp = tcp_sk(sk);
1990 u32 copied_seq = READ_ONCE(tp->copied_seq);
1991 u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
1992 int inq;
1993
1994 inq = rcv_nxt - copied_seq;
1995 if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
1996 lock_sock(sk);
1997 inq = tp->rcv_nxt - tp->copied_seq;
1998 release_sock(sk);
1999 }
2000
2001
2002
2003 if (inq == 0 && sock_flag(sk, SOCK_DONE))
2004 inq = 1;
2005 return inq;
2006}
2007
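/*
 * Copy data from the receive queue to user space, blocking according to
 * the rcvlowat target and timeout, handling urgent-data holes, MSG_PEEK
 * and repair-mode queues, and emitting timestamp / TCP_CM_INQ cmsgs when
 * requested.
 */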
2016int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
2017 int flags, int *addr_len)
2018{
2019 struct tcp_sock *tp = tcp_sk(sk);
2020 int copied = 0;
2021 u32 peek_seq;
2022 u32 *seq;
2023 unsigned long used;
2024 int err, inq;
2025 int target;
2026 long timeo;
2027 struct sk_buff *skb, *last;
2028 u32 urg_hole = 0;
2029 struct scm_timestamping_internal tss;
2030 int cmsg_flags;
2031
2032 if (unlikely(flags & MSG_ERRQUEUE))
2033 return inet_recv_error(sk, msg, len, addr_len);
2034
2035 if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
2036 (sk->sk_state == TCP_ESTABLISHED))
2037 sk_busy_loop(sk, nonblock);
2038
2039 lock_sock(sk);
2040
2041 err = -ENOTCONN;
2042 if (sk->sk_state == TCP_LISTEN)
2043 goto out;
2044
2045 cmsg_flags = tp->recvmsg_inq ? 1 : 0;
2046 timeo = sock_rcvtimeo(sk, nonblock);
2047
2048
2049 if (flags & MSG_OOB)
2050 goto recv_urg;
2051
2052 if (unlikely(tp->repair)) {
2053 err = -EPERM;
2054 if (!(flags & MSG_PEEK))
2055 goto out;
2056
2057 if (tp->repair_queue == TCP_SEND_QUEUE)
2058 goto recv_sndq;
2059
2060 err = -EINVAL;
2061 if (tp->repair_queue == TCP_NO_QUEUE)
2062 goto out;
2063
2064
2065 }
2066
2067 seq = &tp->copied_seq;
2068 if (flags & MSG_PEEK) {
2069 peek_seq = tp->copied_seq;
2070 seq = &peek_seq;
2071 }
2072
2073 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
2074
2075 do {
2076 u32 offset;
2077
2078
2079 if (tp->urg_data && tp->urg_seq == *seq) {
2080 if (copied)
2081 break;
2082 if (signal_pending(current)) {
2083 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2084 break;
2085 }
2086 }
2087
2088
2089
2090 last = skb_peek_tail(&sk->sk_receive_queue);
2091 skb_queue_walk(&sk->sk_receive_queue, skb) {
2092 last = skb;
2093
2094
2095
2096 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2097 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2098 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2099 flags))
2100 break;
2101
2102 offset = *seq - TCP_SKB_CB(skb)->seq;
2103 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2104 pr_err_once("%s: found a SYN, please report !\n", __func__);
2105 offset--;
2106 }
2107 if (offset < skb->len)
2108 goto found_ok_skb;
2109 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2110 goto found_fin_ok;
2111 WARN(!(flags & MSG_PEEK),
2112 "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2113 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2114 }
2115
2116
2117
2118 if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
2119 break;
2120
2121 if (copied) {
2122 if (sk->sk_err ||
2123 sk->sk_state == TCP_CLOSE ||
2124 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2125 !timeo ||
2126 signal_pending(current))
2127 break;
2128 } else {
2129 if (sock_flag(sk, SOCK_DONE))
2130 break;
2131
2132 if (sk->sk_err) {
2133 copied = sock_error(sk);
2134 break;
2135 }
2136
2137 if (sk->sk_shutdown & RCV_SHUTDOWN)
2138 break;
2139
2140 if (sk->sk_state == TCP_CLOSE) {
2141
2142
2143
2144 copied = -ENOTCONN;
2145 break;
2146 }
2147
2148 if (!timeo) {
2149 copied = -EAGAIN;
2150 break;
2151 }
2152
2153 if (signal_pending(current)) {
2154 copied = sock_intr_errno(timeo);
2155 break;
2156 }
2157 }
2158
2159 tcp_cleanup_rbuf(sk, copied);
2160
2161 if (copied >= target) {
2162
2163 release_sock(sk);
2164 lock_sock(sk);
2165 } else {
2166 sk_wait_data(sk, &timeo, last);
2167 }
2168
2169 if ((flags & MSG_PEEK) &&
2170 (peek_seq - copied - urg_hole != tp->copied_seq)) {
2171 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2172 current->comm,
2173 task_pid_nr(current));
2174 peek_seq = tp->copied_seq;
2175 }
2176 continue;
2177
2178found_ok_skb:
2179
2180 used = skb->len - offset;
2181 if (len < used)
2182 used = len;
2183
2184
2185 if (tp->urg_data) {
2186 u32 urg_offset = tp->urg_seq - *seq;
2187 if (urg_offset < used) {
2188 if (!urg_offset) {
2189 if (!sock_flag(sk, SOCK_URGINLINE)) {
2190 WRITE_ONCE(*seq, *seq + 1);
2191 urg_hole++;
2192 offset++;
2193 used--;
2194 if (!used)
2195 goto skip_copy;
2196 }
2197 } else
2198 used = urg_offset;
2199 }
2200 }
2201
2202 if (!(flags & MSG_TRUNC)) {
2203 err = skb_copy_datagram_msg(skb, offset, msg, used);
2204 if (err) {
2205
2206 if (!copied)
2207 copied = -EFAULT;
2208 break;
2209 }
2210 }
2211
2212 WRITE_ONCE(*seq, *seq + used);
2213 copied += used;
2214 len -= used;
2215
2216 tcp_rcv_space_adjust(sk);
2217
2218skip_copy:
2219 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
2220 tp->urg_data = 0;
2221 tcp_fast_path_check(sk);
2222 }
2223
2224 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2225 tcp_update_recv_tstamps(skb, &tss);
2226 cmsg_flags |= 2;
2227 }
2228
2229 if (used + offset < skb->len)
2230 continue;
2231
2232 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2233 goto found_fin_ok;
2234 if (!(flags & MSG_PEEK))
2235 sk_eat_skb(sk, skb);
2236 continue;
2237
2238found_fin_ok:
2239
2240 WRITE_ONCE(*seq, *seq + 1);
2241 if (!(flags & MSG_PEEK))
2242 sk_eat_skb(sk, skb);
2243 break;
2244 } while (len > 0);
2245
2246
2247
2248
2249
2250
2251 tcp_cleanup_rbuf(sk, copied);
2252
2253 release_sock(sk);
2254
2255 if (cmsg_flags) {
2256 if (cmsg_flags & 2)
2257 tcp_recv_timestamp(msg, sk, &tss);
2258 if (cmsg_flags & 1) {
2259 inq = tcp_inq_hint(sk);
2260 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2261 }
2262 }
2263
2264 return copied;
2265
2266out:
2267 release_sock(sk);
2268 return err;
2269
2270recv_urg:
2271 err = tcp_recv_urg(sk, msg, len, flags);
2272 goto out;
2273
2274recv_sndq:
2275 err = tcp_peek_sndq(sk, msg, len);
2276 goto out;
2277}
2278EXPORT_SYMBOL(tcp_recvmsg);
2279
2280void tcp_set_state(struct sock *sk, int state)
2281{
2282 int oldstate = sk->sk_state;
2283
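	/*
	 * The BPF_TCP_* values exposed to BPF programs are required to match
	 * the kernel's TCP_* states one to one; these BUILD_BUG_ON()s catch
	 * any future divergence at compile time.
	 */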
2291 BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2292 BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2293 BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2294 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2295 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2296 BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2297 BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2298 BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2299 BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2300 BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2301 BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2302 BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2303 BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2304
2305 if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2306 tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2307
2308 switch (state) {
2309 case TCP_ESTABLISHED:
2310 if (oldstate != TCP_ESTABLISHED)
2311 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2312 break;
2313
2314 case TCP_CLOSE:
2315 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2316 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2317
2318 sk->sk_prot->unhash(sk);
2319 if (inet_csk(sk)->icsk_bind_hash &&
2320 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2321 inet_put_port(sk);
2322 fallthrough;
2323 default:
2324 if (oldstate == TCP_ESTABLISHED)
2325 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2326 }
2327
2328
2329
2330
2331 inet_sk_state_store(sk, state);
2332}
2333EXPORT_SYMBOL_GPL(tcp_set_state);
2334
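/*
 * State transition table used when closing: new_state[old_state] gives the
 * next state, with TCP_ACTION_FIN or'ed in when a FIN must be sent (see
 * tcp_close_state() below).
 */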
2342static const unsigned char new_state[16] = {
2343
2344 [0 ] = TCP_CLOSE,
2345 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2346 [TCP_SYN_SENT] = TCP_CLOSE,
2347 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2348 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2349 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2350 [TCP_TIME_WAIT] = TCP_CLOSE,
2351 [TCP_CLOSE] = TCP_CLOSE,
2352 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2353 [TCP_LAST_ACK] = TCP_LAST_ACK,
2354 [TCP_LISTEN] = TCP_CLOSE,
2355 [TCP_CLOSING] = TCP_CLOSING,
2356 [TCP_NEW_SYN_RECV] = TCP_CLOSE,
2357};
2358
2359static int tcp_close_state(struct sock *sk)
2360{
2361 int next = (int)new_state[sk->sk_state];
2362 int ns = next & TCP_STATE_MASK;
2363
2364 tcp_set_state(sk, ns);
2365
2366 return next & TCP_ACTION_FIN;
2367}
2368
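/*
 * Shut down the sending side of the connection: for SEND_SHUTDOWN in a
 * state that still owes a FIN, advance the state machine and transmit the
 * FIN. Receive-side shutdown needs no protocol action here.
 */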
2374void tcp_shutdown(struct sock *sk, int how)
2375{
2376
2377
2378
2379
2380 if (!(how & SEND_SHUTDOWN))
2381 return;
2382
2383
2384 if ((1 << sk->sk_state) &
2385 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2386 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2387
2388 if (tcp_close_state(sk))
2389 tcp_send_fin(sk);
2390 }
2391}
2392EXPORT_SYMBOL(tcp_shutdown);
2393
2394bool tcp_check_oom(struct sock *sk, int shift)
2395{
2396 bool too_many_orphans, out_of_socket_memory;
2397
2398 too_many_orphans = tcp_too_many_orphans(sk, shift);
2399 out_of_socket_memory = tcp_out_of_memory(sk);
2400
2401 if (too_many_orphans)
2402 net_info_ratelimited("too many orphaned sockets\n");
2403 if (out_of_socket_memory)
2404 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2405 return too_many_orphans || out_of_socket_memory;
2406}
2407
2408void tcp_close(struct sock *sk, long timeout)
2409{
2410 struct sk_buff *skb;
2411 int data_was_unread = 0;
2412 int state;
2413
2414 lock_sock(sk);
2415 sk->sk_shutdown = SHUTDOWN_MASK;
2416
2417 if (sk->sk_state == TCP_LISTEN) {
2418 tcp_set_state(sk, TCP_CLOSE);
2419
2420
2421 inet_csk_listen_stop(sk);
2422
2423 goto adjudge_to_death;
2424 }
2425
2426
2427
2428
2429
2430 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2431 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2432
2433 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2434 len--;
2435 data_was_unread += len;
2436 __kfree_skb(skb);
2437 }
2438
2439 sk_mem_reclaim(sk);
2440
2441
2442 if (sk->sk_state == TCP_CLOSE)
2443 goto adjudge_to_death;
2444
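	/*
	 * If the socket is in repair mode, tear it down via disconnect()
	 * without the FIN handshake; if data was received but never read,
	 * send a RST (RFC 2525, section 2.17); with SO_LINGER and a zero
	 * linger time, disconnect immediately; otherwise follow the normal
	 * state machine and, if required, send a FIN.
	 */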
2452 if (unlikely(tcp_sk(sk)->repair)) {
2453 sk->sk_prot->disconnect(sk, 0);
2454 } else if (data_was_unread) {
2455
2456 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2457 tcp_set_state(sk, TCP_CLOSE);
2458 tcp_send_active_reset(sk, sk->sk_allocation);
2459 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2460
2461 sk->sk_prot->disconnect(sk, 0);
2462 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2463 } else if (tcp_close_state(sk)) {
2493 tcp_send_fin(sk);
2494 }
2495
2496 sk_stream_wait_close(sk, timeout);
2497
2498adjudge_to_death:
2499 state = sk->sk_state;
2500 sock_hold(sk);
2501 sock_orphan(sk);
2502
2503 local_bh_disable();
2504 bh_lock_sock(sk);
2505
2506 __release_sock(sk);
2507
2508 percpu_counter_inc(sk->sk_prot->orphan_count);
2509
2510
2511 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2512 goto out;
2513
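	/*
	 * FIN-WAIT-2 lingering: a negative tcp_linger2 means abort with a
	 * RST right away; otherwise hand the socket to the FIN-WAIT-2 timer,
	 * or directly to timewait handling when the remaining timeout
	 * already fits within TCP_TIMEWAIT_LEN.
	 */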
2528 if (sk->sk_state == TCP_FIN_WAIT2) {
2529 struct tcp_sock *tp = tcp_sk(sk);
2530 if (tp->linger2 < 0) {
2531 tcp_set_state(sk, TCP_CLOSE);
2532 tcp_send_active_reset(sk, GFP_ATOMIC);
2533 __NET_INC_STATS(sock_net(sk),
2534 LINUX_MIB_TCPABORTONLINGER);
2535 } else {
2536 const int tmo = tcp_fin_time(sk);
2537
2538 if (tmo > TCP_TIMEWAIT_LEN) {
2539 inet_csk_reset_keepalive_timer(sk,
2540 tmo - TCP_TIMEWAIT_LEN);
2541 } else {
2542 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2543 goto out;
2544 }
2545 }
2546 }
2547 if (sk->sk_state != TCP_CLOSE) {
2548 sk_mem_reclaim(sk);
2549 if (tcp_check_oom(sk, 0)) {
2550 tcp_set_state(sk, TCP_CLOSE);
2551 tcp_send_active_reset(sk, GFP_ATOMIC);
2552 __NET_INC_STATS(sock_net(sk),
2553 LINUX_MIB_TCPABORTONMEMORY);
2554 } else if (!check_net(sock_net(sk))) {
2555
2556 tcp_set_state(sk, TCP_CLOSE);
2557 }
2558 }
2559
2560 if (sk->sk_state == TCP_CLOSE) {
2561 struct request_sock *req;
2562
2563 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2564 lockdep_sock_is_held(sk));
2565
2566
2567
2568
2569 if (req)
2570 reqsk_fastopen_remove(sk, req, false);
2571 inet_csk_destroy_sock(sk);
2572 }
2573
2574
2575out:
2576 bh_unlock_sock(sk);
2577 local_bh_enable();
2578 release_sock(sk);
2579 sock_put(sk);
2580}
2581EXPORT_SYMBOL(tcp_close);
2582
2583
2584
2585static inline bool tcp_need_reset(int state)
2586{
2587 return (1 << state) &
2588 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2589 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2590}
2591
2592static void tcp_rtx_queue_purge(struct sock *sk)
2593{
2594 struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2595
2596 tcp_sk(sk)->highest_sack = NULL;
2597 while (p) {
2598 struct sk_buff *skb = rb_to_skb(p);
2599
2600 p = rb_next(p);
2601
2602
2603
2604 tcp_rtx_queue_unlink(skb, sk);
2605 sk_wmem_free_skb(sk, skb);
2606 }
2607}
2608
2609void tcp_write_queue_purge(struct sock *sk)
2610{
2611 struct sk_buff *skb;
2612
2613 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
2614 while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
2615 tcp_skb_tsorted_anchor_cleanup(skb);
2616 sk_wmem_free_skb(sk, skb);
2617 }
2618 tcp_rtx_queue_purge(sk);
2619 skb = sk->sk_tx_skb_cache;
2620 if (skb) {
2621 __kfree_skb(skb);
2622 sk->sk_tx_skb_cache = NULL;
2623 }
2624 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2625 sk_mem_reclaim(sk);
2626 tcp_clear_all_retrans_hints(tcp_sk(sk));
2627 tcp_sk(sk)->packets_out = 0;
2628 inet_csk(sk)->icsk_backoff = 0;
2629}
2630
2631int tcp_disconnect(struct sock *sk, int flags)
2632{
2633 struct inet_sock *inet = inet_sk(sk);
2634 struct inet_connection_sock *icsk = inet_csk(sk);
2635 struct tcp_sock *tp = tcp_sk(sk);
2636 int old_state = sk->sk_state;
2637 u32 seq;
2638
2639 if (old_state != TCP_CLOSE)
2640 tcp_set_state(sk, TCP_CLOSE);
2641
2642
2643 if (old_state == TCP_LISTEN) {
2644 inet_csk_listen_stop(sk);
2645 } else if (unlikely(tp->repair)) {
2646 sk->sk_err = ECONNABORTED;
2647 } else if (tcp_need_reset(old_state) ||
2648 (tp->snd_nxt != tp->write_seq &&
2649 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2650
2651
2652
2653 tcp_send_active_reset(sk, gfp_any());
2654 sk->sk_err = ECONNRESET;
2655 } else if (old_state == TCP_SYN_SENT)
2656 sk->sk_err = ECONNRESET;
2657
2658 tcp_clear_xmit_timers(sk);
2659 __skb_queue_purge(&sk->sk_receive_queue);
2660 if (sk->sk_rx_skb_cache) {
2661 __kfree_skb(sk->sk_rx_skb_cache);
2662 sk->sk_rx_skb_cache = NULL;
2663 }
2664 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
2665 tp->urg_data = 0;
2666 tcp_write_queue_purge(sk);
2667 tcp_fastopen_active_disable_ofo_check(sk);
2668 skb_rbtree_purge(&tp->out_of_order_queue);
2669
2670 inet->inet_dport = 0;
2671
2672 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2673 inet_reset_saddr(sk);
2674
2675 sk->sk_shutdown = 0;
2676 sock_reset_flag(sk, SOCK_DONE);
2677 tp->srtt_us = 0;
2678 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
2679 tp->rcv_rtt_last_tsecr = 0;
2680
2681 seq = tp->write_seq + tp->max_window + 2;
2682 if (!seq)
2683 seq = 1;
2684 WRITE_ONCE(tp->write_seq, seq);
2685
2686 icsk->icsk_backoff = 0;
2687 icsk->icsk_probes_out = 0;
2688 icsk->icsk_rto = TCP_TIMEOUT_INIT;
2689 icsk->icsk_rto_min = TCP_RTO_MIN;
2690 icsk->icsk_delack_max = TCP_DELACK_MAX;
2691 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2692 tp->snd_cwnd = TCP_INIT_CWND;
2693 tp->snd_cwnd_cnt = 0;
2694 tp->window_clamp = 0;
2695 tp->delivered = 0;
2696 tp->delivered_ce = 0;
2697 if (icsk->icsk_ca_ops->release)
2698 icsk->icsk_ca_ops->release(sk);
2699 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
2700 icsk->icsk_ca_initialized = 0;
2701 tcp_set_ca_state(sk, TCP_CA_Open);
2702 tp->is_sack_reneg = 0;
2703 tcp_clear_retrans(tp);
2704 tp->total_retrans = 0;
2705 inet_csk_delack_init(sk);
	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
	 * issue in __tcp_select_window()
	 */
	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
2710 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2711 __sk_dst_reset(sk);
2712 dst_release(sk->sk_rx_dst);
2713 sk->sk_rx_dst = NULL;
2714 tcp_saved_syn_free(tp);
2715 tp->compressed_ack = 0;
2716 tp->segs_in = 0;
2717 tp->segs_out = 0;
2718 tp->bytes_sent = 0;
2719 tp->bytes_acked = 0;
2720 tp->bytes_received = 0;
2721 tp->bytes_retrans = 0;
2722 tp->data_segs_in = 0;
2723 tp->data_segs_out = 0;
2724 tp->duplicate_sack[0].start_seq = 0;
2725 tp->duplicate_sack[0].end_seq = 0;
2726 tp->dsack_dups = 0;
2727 tp->reord_seen = 0;
2728 tp->retrans_out = 0;
2729 tp->sacked_out = 0;
2730 tp->tlp_high_seq = 0;
2731 tp->last_oow_ack_time = 0;
	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;
2734 tp->rack.mstamp = 0;
2735 tp->rack.advanced = 0;
2736 tp->rack.reo_wnd_steps = 1;
2737 tp->rack.last_delivered = 0;
2738 tp->rack.reo_wnd_persist = 0;
2739 tp->rack.dsack_seen = 0;
2740 tp->syn_data_acked = 0;
2741 tp->rx_opt.saw_tstamp = 0;
2742 tp->rx_opt.dsack = 0;
2743 tp->rx_opt.num_sacks = 0;
2744 tp->rcv_ooopack = 0;
2745
	/* Clean up fastopen related fields */
	tcp_free_fastopen_req(tp);
2749 inet->defer_connect = 0;
2750 tp->fastopen_client_fail = 0;
2751
2752 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2753
2754 if (sk->sk_frag.page) {
2755 put_page(sk->sk_frag.page);
2756 sk->sk_frag.page = NULL;
2757 sk->sk_frag.offset = 0;
2758 }
2759
2760 sk->sk_error_report(sk);
2761 return 0;
2762}
2763EXPORT_SYMBOL(tcp_disconnect);
2764
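/* TCP_REPAIR is only allowed for CAP_NET_ADMIN in the socket's user
 * namespace, and never on a listening socket.
 */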
2765static inline bool tcp_can_repair_sock(const struct sock *sk)
2766{
2767 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2768 (sk->sk_state != TCP_LISTEN);
2769}
2770
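/* TCP_REPAIR_WINDOW: restore send/receive window bookkeeping on a socket in
 * repair mode.  The supplied values are sanity-checked against each other
 * and against rcv_nxt before being applied.
 */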
2771static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
2772{
2773 struct tcp_repair_window opt;
2774
2775 if (!tp->repair)
2776 return -EPERM;
2777
2778 if (len != sizeof(opt))
2779 return -EINVAL;
2780
2781 if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
2782 return -EFAULT;
2783
2784 if (opt.max_window < opt.snd_wnd)
2785 return -EINVAL;
2786
2787 if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
2788 return -EINVAL;
2789
2790 if (after(opt.rcv_wup, tp->rcv_nxt))
2791 return -EINVAL;
2792
2793 tp->snd_wl1 = opt.snd_wl1;
2794 tp->snd_wnd = opt.snd_wnd;
2795 tp->max_window = opt.max_window;
2796
2797 tp->rcv_wnd = opt.rcv_wnd;
2798 tp->rcv_wup = opt.rcv_wup;
2799
2800 return 0;
2801}
2802
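/* TCP_REPAIR_OPTIONS: replay the options (MSS clamp, window scales, SACK and
 * timestamp permission) that were negotiated on the connection now being
 * restored, one struct tcp_repair_opt at a time.
 */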
2803static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
2804 unsigned int len)
2805{
2806 struct tcp_sock *tp = tcp_sk(sk);
2807 struct tcp_repair_opt opt;
2808 size_t offset = 0;
2809
2810 while (len >= sizeof(opt)) {
2811 if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
2812 return -EFAULT;
2813
2814 offset += sizeof(opt);
2815 len -= sizeof(opt);
2816
2817 switch (opt.opt_code) {
2818 case TCPOPT_MSS:
2819 tp->rx_opt.mss_clamp = opt.opt_val;
2820 tcp_mtup_init(sk);
2821 break;
2822 case TCPOPT_WINDOW:
2823 {
2824 u16 snd_wscale = opt.opt_val & 0xFFFF;
2825 u16 rcv_wscale = opt.opt_val >> 16;
2826
2827 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
2828 return -EFBIG;
2829
2830 tp->rx_opt.snd_wscale = snd_wscale;
2831 tp->rx_opt.rcv_wscale = rcv_wscale;
2832 tp->rx_opt.wscale_ok = 1;
2833 }
2834 break;
2835 case TCPOPT_SACK_PERM:
2836 if (opt.opt_val != 0)
2837 return -EINVAL;
2838
2839 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2840 break;
2841 case TCPOPT_TIMESTAMP:
2842 if (opt.opt_val != 0)
2843 return -EINVAL;
2844
2845 tp->rx_opt.tstamp_ok = 1;
2846 break;
2847 }
2848 }
2849
2850 return 0;
2851}
2852
2853DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2854EXPORT_SYMBOL(tcp_tx_delay_enabled);
2855
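/* Flip the tcp_tx_delay_enabled static key the first time any socket sets a
 * non-zero TCP_TX_DELAY, so the transmit fast path only pays for the feature
 * once it is actually in use.
 */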
2856static void tcp_enable_tx_delay(void)
2857{
2858 if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
2859 static int __tcp_tx_delay_enabled = 0;
2860
2861 if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
2862 static_branch_enable(&tcp_tx_delay_enabled);
2863 pr_info("TCP_TX_DELAY enabled\n");
2864 }
2865 }
2866}
2867
/* TCP_CORK: when set, tells TCP to queue non-full frames rather than send
 * them immediately.  When the application later clears the option, any
 * pending partial frames are transmitted.  This is typically used together
 * with sendfile(): headers are written first with write(), then the file
 * data is sent, and the stack emits properly filled frames.
 *
 * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
 * TCP_NODELAY.
 */
static void __tcp_sock_set_cork(struct sock *sk, bool on)
2878{
2879 struct tcp_sock *tp = tcp_sk(sk);
2880
2881 if (on) {
2882 tp->nonagle |= TCP_NAGLE_CORK;
2883 } else {
2884 tp->nonagle &= ~TCP_NAGLE_CORK;
2885 if (tp->nonagle & TCP_NAGLE_OFF)
2886 tp->nonagle |= TCP_NAGLE_PUSH;
2887 tcp_push_pending_frames(sk);
2888 }
2889}
2890
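/* Locked helper for in-kernel users that want TCP_CORK semantics without
 * going through setsockopt().  A minimal usage sketch (hypothetical caller):
 *
 *	tcp_sock_set_cork(sk, true);
 *	// ...queue headers and payload, e.g. via kernel_sendmsg()...
 *	tcp_sock_set_cork(sk, false);	// uncork: push any partial frame
 */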
2891void tcp_sock_set_cork(struct sock *sk, bool on)
2892{
2893 lock_sock(sk);
2894 __tcp_sock_set_cork(sk, on);
2895 release_sock(sk);
2896}
2897EXPORT_SYMBOL(tcp_sock_set_cork);
2898
/* TCP_NODELAY is weaker than TCP_CORK, so that this option on a corked
 * socket is remembered, but it is not activated until the cork is cleared.
 *
 * However, when TCP_NODELAY is set we make an explicit push, which overrides
 * even TCP_CORK for currently queued segments.
 */
static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
2906{
2907 if (on) {
2908 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2909 tcp_push_pending_frames(sk);
2910 } else {
2911 tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
2912 }
2913}
2914
2915void tcp_sock_set_nodelay(struct sock *sk)
2916{
2917 lock_sock(sk);
2918 __tcp_sock_set_nodelay(sk, true);
2919 release_sock(sk);
2920}
2921EXPORT_SYMBOL(tcp_sock_set_nodelay);
2922
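/* TCP_QUICKACK: zero re-enters pingpong (delayed ACK) mode; a non-zero value
 * leaves it and, if an ACK is already scheduled on an established socket,
 * flushes it immediately.  An even value drops back into pingpong mode
 * afterwards, so the quickack effect is one-shot rather than sticky.
 */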
2923static void __tcp_sock_set_quickack(struct sock *sk, int val)
2924{
2925 if (!val) {
2926 inet_csk_enter_pingpong_mode(sk);
2927 return;
2928 }
2929
2930 inet_csk_exit_pingpong_mode(sk);
2931 if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2932 inet_csk_ack_scheduled(sk)) {
2933 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
2934 tcp_cleanup_rbuf(sk, 1);
2935 if (!(val & 1))
2936 inet_csk_enter_pingpong_mode(sk);
2937 }
2938}
2939
2940void tcp_sock_set_quickack(struct sock *sk, int val)
2941{
2942 lock_sock(sk);
2943 __tcp_sock_set_quickack(sk, val);
2944 release_sock(sk);
2945}
2946EXPORT_SYMBOL(tcp_sock_set_quickack);
2947
2948int tcp_sock_set_syncnt(struct sock *sk, int val)
2949{
2950 if (val < 1 || val > MAX_TCP_SYNCNT)
2951 return -EINVAL;
2952
2953 lock_sock(sk);
2954 inet_csk(sk)->icsk_syn_retries = val;
2955 release_sock(sk);
2956 return 0;
2957}
2958EXPORT_SYMBOL(tcp_sock_set_syncnt);
2959
2960void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
2961{
2962 lock_sock(sk);
2963 inet_csk(sk)->icsk_user_timeout = val;
2964 release_sock(sk);
2965}
2966EXPORT_SYMBOL(tcp_sock_set_user_timeout);
2967
2968int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
2969{
2970 struct tcp_sock *tp = tcp_sk(sk);
2971
2972 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2973 return -EINVAL;
2974
2975 tp->keepalive_time = val * HZ;
2976 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2977 !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
2978 u32 elapsed = keepalive_time_elapsed(tp);
2979
2980 if (tp->keepalive_time > elapsed)
2981 elapsed = tp->keepalive_time - elapsed;
2982 else
2983 elapsed = 0;
2984 inet_csk_reset_keepalive_timer(sk, elapsed);
2985 }
2986
2987 return 0;
2988}
2989
2990int tcp_sock_set_keepidle(struct sock *sk, int val)
2991{
2992 int err;
2993
2994 lock_sock(sk);
2995 err = tcp_sock_set_keepidle_locked(sk, val);
2996 release_sock(sk);
2997 return err;
2998}
2999EXPORT_SYMBOL(tcp_sock_set_keepidle);
3000
3001int tcp_sock_set_keepintvl(struct sock *sk, int val)
3002{
3003 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3004 return -EINVAL;
3005
3006 lock_sock(sk);
3007 tcp_sk(sk)->keepalive_intvl = val * HZ;
3008 release_sock(sk);
3009 return 0;
3010}
3011EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3012
3013int tcp_sock_set_keepcnt(struct sock *sk, int val)
3014{
3015 if (val < 1 || val > MAX_TCP_KEEPCNT)
3016 return -EINVAL;
3017
3018 lock_sock(sk);
3019 tcp_sk(sk)->keepalive_probes = val;
3020 release_sock(sk);
3021 return 0;
3022}
3023EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3024

/*
 *	Socket option code for TCP.
 */
static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3029 sockptr_t optval, unsigned int optlen)
3030{
3031 struct tcp_sock *tp = tcp_sk(sk);
3032 struct inet_connection_sock *icsk = inet_csk(sk);
3033 struct net *net = sock_net(sk);
3034 int val;
3035 int err = 0;
3036
	/* These are data/string values, all the others are ints */
	switch (optname) {
3039 case TCP_CONGESTION: {
3040 char name[TCP_CA_NAME_MAX];
3041
3042 if (optlen < 1)
3043 return -EINVAL;
3044
3045 val = strncpy_from_sockptr(name, optval,
3046 min_t(long, TCP_CA_NAME_MAX-1, optlen));
3047 if (val < 0)
3048 return -EFAULT;
3049 name[val] = 0;
3050
3051 lock_sock(sk);
3052 err = tcp_set_congestion_control(sk, name, true,
3053 ns_capable(sock_net(sk)->user_ns,
3054 CAP_NET_ADMIN));
3055 release_sock(sk);
3056 return err;
3057 }
3058 case TCP_ULP: {
3059 char name[TCP_ULP_NAME_MAX];
3060
3061 if (optlen < 1)
3062 return -EINVAL;
3063
3064 val = strncpy_from_sockptr(name, optval,
3065 min_t(long, TCP_ULP_NAME_MAX - 1,
3066 optlen));
3067 if (val < 0)
3068 return -EFAULT;
3069 name[val] = 0;
3070
3071 lock_sock(sk);
3072 err = tcp_set_ulp(sk, name);
3073 release_sock(sk);
3074 return err;
3075 }
3076 case TCP_FASTOPEN_KEY: {
3077 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3078 __u8 *backup_key = NULL;
3079
		/* Allow a backup key as well to facilitate key rotation
		 * First key is the active one.
		 */
		if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3084 optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3085 return -EINVAL;
3086
3087 if (copy_from_sockptr(key, optval, optlen))
3088 return -EFAULT;
3089
3090 if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3091 backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3092
3093 return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
3094 }
3095 default:
		/* fallthru */
		break;
3098 }
3099
3100 if (optlen < sizeof(int))
3101 return -EINVAL;
3102
3103 if (copy_from_sockptr(&val, optval, sizeof(val)))
3104 return -EFAULT;
3105
3106 lock_sock(sk);
3107
3108 switch (optname) {
3109 case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used.
		 */
		if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
3115 err = -EINVAL;
3116 break;
3117 }
3118 tp->rx_opt.user_mss = val;
3119 break;
3120
3121 case TCP_NODELAY:
3122 __tcp_sock_set_nodelay(sk, val);
3123 break;
3124
3125 case TCP_THIN_LINEAR_TIMEOUTS:
3126 if (val < 0 || val > 1)
3127 err = -EINVAL;
3128 else
3129 tp->thin_lto = val;
3130 break;
3131
3132 case TCP_THIN_DUPACK:
3133 if (val < 0 || val > 1)
3134 err = -EINVAL;
3135 break;
3136
3137 case TCP_REPAIR:
3138 if (!tcp_can_repair_sock(sk))
3139 err = -EPERM;
3140 else if (val == TCP_REPAIR_ON) {
3141 tp->repair = 1;
3142 sk->sk_reuse = SK_FORCE_REUSE;
3143 tp->repair_queue = TCP_NO_QUEUE;
3144 } else if (val == TCP_REPAIR_OFF) {
3145 tp->repair = 0;
3146 sk->sk_reuse = SK_NO_REUSE;
3147 tcp_send_window_probe(sk);
3148 } else if (val == TCP_REPAIR_OFF_NO_WP) {
3149 tp->repair = 0;
3150 sk->sk_reuse = SK_NO_REUSE;
3151 } else
3152 err = -EINVAL;
3153
3154 break;
3155
3156 case TCP_REPAIR_QUEUE:
3157 if (!tp->repair)
3158 err = -EPERM;
3159 else if ((unsigned int)val < TCP_QUEUES_NR)
3160 tp->repair_queue = val;
3161 else
3162 err = -EINVAL;
3163 break;
3164
3165 case TCP_QUEUE_SEQ:
3166 if (sk->sk_state != TCP_CLOSE)
3167 err = -EPERM;
3168 else if (tp->repair_queue == TCP_SEND_QUEUE)
3169 WRITE_ONCE(tp->write_seq, val);
3170 else if (tp->repair_queue == TCP_RECV_QUEUE) {
3171 WRITE_ONCE(tp->rcv_nxt, val);
3172 WRITE_ONCE(tp->copied_seq, val);
		} else {
			err = -EINVAL;
		}
3176 break;
3177
3178 case TCP_REPAIR_OPTIONS:
3179 if (!tp->repair)
3180 err = -EINVAL;
3181 else if (sk->sk_state == TCP_ESTABLISHED)
3182 err = tcp_repair_options_est(sk, optval, optlen);
3183 else
3184 err = -EPERM;
3185 break;
3186
3187 case TCP_CORK:
3188 __tcp_sock_set_cork(sk, val);
3189 break;
3190
3191 case TCP_KEEPIDLE:
3192 err = tcp_sock_set_keepidle_locked(sk, val);
3193 break;
3194 case TCP_KEEPINTVL:
3195 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3196 err = -EINVAL;
3197 else
3198 tp->keepalive_intvl = val * HZ;
3199 break;
3200 case TCP_KEEPCNT:
3201 if (val < 1 || val > MAX_TCP_KEEPCNT)
3202 err = -EINVAL;
3203 else
3204 tp->keepalive_probes = val;
3205 break;
3206 case TCP_SYNCNT:
3207 if (val < 1 || val > MAX_TCP_SYNCNT)
3208 err = -EINVAL;
3209 else
3210 icsk->icsk_syn_retries = val;
3211 break;
3212
3213 case TCP_SAVE_SYN:
		/* 0: disable, 1: enable, 2: start from ether_header */
		if (val < 0 || val > 2)
3216 err = -EINVAL;
3217 else
3218 tp->save_syn = val;
3219 break;
3220
3221 case TCP_LINGER2:
3222 if (val < 0)
3223 tp->linger2 = -1;
3224 else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3225 tp->linger2 = TCP_FIN_TIMEOUT_MAX;
3226 else
3227 tp->linger2 = val * HZ;
3228 break;
3229
3230 case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
		icsk->icsk_accept_queue.rskq_defer_accept =
3233 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3234 TCP_RTO_MAX / HZ);
3235 break;
3236
3237 case TCP_WINDOW_CLAMP:
3238 if (!val) {
3239 if (sk->sk_state != TCP_CLOSE) {
3240 err = -EINVAL;
3241 break;
3242 }
3243 tp->window_clamp = 0;
3244 } else
3245 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3246 SOCK_MIN_RCVBUF / 2 : val;
3247 break;
3248
3249 case TCP_QUICKACK:
3250 __tcp_sock_set_quickack(sk, val);
3251 break;
3252
3253#ifdef CONFIG_TCP_MD5SIG
3254 case TCP_MD5SIG:
3255 case TCP_MD5SIG_EXT:
3256 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3257 break;
3258#endif
3259 case TCP_USER_TIMEOUT:
		/* Cap the max time in ms TCP will retry or probe the window
		 * before giving up and aborting (ETIMEDOUT) a connection.
		 */
		if (val < 0)
3264 err = -EINVAL;
3265 else
3266 icsk->icsk_user_timeout = val;
3267 break;
3268
3269 case TCP_FASTOPEN:
3270 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3271 TCPF_LISTEN))) {
3272 tcp_fastopen_init_key_once(net);
3273
3274 fastopen_queue_tune(sk, val);
3275 } else {
3276 err = -EINVAL;
3277 }
3278 break;
3279 case TCP_FASTOPEN_CONNECT:
3280 if (val > 1 || val < 0) {
3281 err = -EINVAL;
3282 } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3283 if (sk->sk_state == TCP_CLOSE)
3284 tp->fastopen_connect = val;
3285 else
3286 err = -EINVAL;
3287 } else {
3288 err = -EOPNOTSUPP;
3289 }
3290 break;
3291 case TCP_FASTOPEN_NO_COOKIE:
3292 if (val > 1 || val < 0)
3293 err = -EINVAL;
3294 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3295 err = -EINVAL;
3296 else
3297 tp->fastopen_no_cookie = val;
3298 break;
3299 case TCP_TIMESTAMP:
3300 if (!tp->repair)
3301 err = -EPERM;
3302 else
3303 tp->tsoffset = val - tcp_time_stamp_raw();
3304 break;
3305 case TCP_REPAIR_WINDOW:
3306 err = tcp_repair_set_window(tp, optval, optlen);
3307 break;
3308 case TCP_NOTSENT_LOWAT:
3309 tp->notsent_lowat = val;
3310 sk->sk_write_space(sk);
3311 break;
3312 case TCP_INQ:
3313 if (val > 1 || val < 0)
3314 err = -EINVAL;
3315 else
3316 tp->recvmsg_inq = val;
3317 break;
3318 case TCP_TX_DELAY:
3319 if (val)
3320 tcp_enable_tx_delay();
3321 tp->tcp_tx_delay = val;
3322 break;
3323 default:
3324 err = -ENOPROTOOPT;
3325 break;
3326 }
3327
3328 release_sock(sk);
3329 return err;
3330}
3331
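/* Socket-layer entry point: options at levels other than SOL_TCP are passed
 * to the address-family specific handler, everything else is handled by
 * do_tcp_setsockopt() above.
 */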
3332int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
3333 unsigned int optlen)
3334{
3335 const struct inet_connection_sock *icsk = inet_csk(sk);
3336
3337 if (level != SOL_TCP)
3338 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3339 optval, optlen);
3340 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3341}
3342EXPORT_SYMBOL(tcp_setsockopt);
3343
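/* Convert the chronograph counters (busy, rwnd-limited, sndbuf-limited) from
 * jiffies to microseconds, crediting the currently running chrono up to now;
 * tcpi_busy_time reports the sum of all three.
 */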
3344static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3345 struct tcp_info *info)
3346{
3347 u64 stats[__TCP_CHRONO_MAX], total = 0;
3348 enum tcp_chrono i;
3349
3350 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3351 stats[i] = tp->chrono_stat[i - 1];
3352 if (i == tp->chrono_type)
3353 stats[i] += tcp_jiffies32 - tp->chrono_start;
3354 stats[i] *= USEC_PER_SEC / HZ;
3355 total += stats[i];
3356 }
3357
3358 info->tcpi_busy_time = total;
3359 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3360 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3361}
3362
/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
3365{
3366 const struct tcp_sock *tp = tcp_sk(sk);
3367 const struct inet_connection_sock *icsk = inet_csk(sk);
3368 unsigned long rate;
3369 u32 now;
3370 u64 rate64;
3371 bool slow;
3372
3373 memset(info, 0, sizeof(*info));
3374 if (sk->sk_type != SOCK_STREAM)
3375 return;
3376
3377 info->tcpi_state = inet_sk_state_load(sk);
3378
	/* Report meaningful fields for all TCP states, including listeners */
	rate = READ_ONCE(sk->sk_pacing_rate);
3381 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3382 info->tcpi_pacing_rate = rate64;
3383
3384 rate = READ_ONCE(sk->sk_max_pacing_rate);
3385 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3386 info->tcpi_max_pacing_rate = rate64;
3387
3388 info->tcpi_reordering = tp->reordering;
3389 info->tcpi_snd_cwnd = tp->snd_cwnd;
3390
3391 if (info->tcpi_state == TCP_LISTEN) {
		/* listeners aliased fields :
		 * tcpi_unacked -> Number of children ready for accept()
		 * tcpi_sacked  -> max backlog
		 */
		info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3397 info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
3398 return;
3399 }
3400
3401 slow = lock_sock_fast(sk);
3402
3403 info->tcpi_ca_state = icsk->icsk_ca_state;
3404 info->tcpi_retransmits = icsk->icsk_retransmits;
3405 info->tcpi_probes = icsk->icsk_probes_out;
3406 info->tcpi_backoff = icsk->icsk_backoff;
3407
3408 if (tp->rx_opt.tstamp_ok)
3409 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3410 if (tcp_is_sack(tp))
3411 info->tcpi_options |= TCPI_OPT_SACK;
3412 if (tp->rx_opt.wscale_ok) {
3413 info->tcpi_options |= TCPI_OPT_WSCALE;
3414 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3415 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3416 }
3417
3418 if (tp->ecn_flags & TCP_ECN_OK)
3419 info->tcpi_options |= TCPI_OPT_ECN;
3420 if (tp->ecn_flags & TCP_ECN_SEEN)
3421 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3422 if (tp->syn_data_acked)
3423 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3424
3425 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3426 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3427 info->tcpi_snd_mss = tp->mss_cache;
3428 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3429
3430 info->tcpi_unacked = tp->packets_out;
3431 info->tcpi_sacked = tp->sacked_out;
3432
3433 info->tcpi_lost = tp->lost_out;
3434 info->tcpi_retrans = tp->retrans_out;
3435
3436 now = tcp_jiffies32;
3437 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3438 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3439 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3440
3441 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3442 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3443 info->tcpi_rtt = tp->srtt_us >> 3;
3444 info->tcpi_rttvar = tp->mdev_us >> 2;
3445 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3446 info->tcpi_advmss = tp->advmss;
3447
3448 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3449 info->tcpi_rcv_space = tp->rcvq_space.space;
3450
3451 info->tcpi_total_retrans = tp->total_retrans;
3452
3453 info->tcpi_bytes_acked = tp->bytes_acked;
3454 info->tcpi_bytes_received = tp->bytes_received;
3455 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3456 tcp_get_info_chrono_stats(tp, info);
3457
3458 info->tcpi_segs_out = tp->segs_out;
3459 info->tcpi_segs_in = tp->segs_in;
3460
3461 info->tcpi_min_rtt = tcp_min_rtt(tp);
3462 info->tcpi_data_segs_in = tp->data_segs_in;
3463 info->tcpi_data_segs_out = tp->data_segs_out;
3464
3465 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3466 rate64 = tcp_compute_delivery_rate(tp);
3467 if (rate64)
3468 info->tcpi_delivery_rate = rate64;
3469 info->tcpi_delivered = tp->delivered;
3470 info->tcpi_delivered_ce = tp->delivered_ce;
3471 info->tcpi_bytes_sent = tp->bytes_sent;
3472 info->tcpi_bytes_retrans = tp->bytes_retrans;
3473 info->tcpi_dsack_dups = tp->dsack_dups;
3474 info->tcpi_reord_seen = tp->reord_seen;
3475 info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3476 info->tcpi_snd_wnd = tp->snd_wnd;
3477 info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
3478 unlock_sock_fast(sk, slow);
3479}
3480EXPORT_SYMBOL_GPL(tcp_get_info);
3481
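/* Worst-case netlink attribute space needed by
 * tcp_get_timestamping_opt_stats() below, one term per TCP_NLA_* attribute
 * it emits, in the same order.
 */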
3482static size_t tcp_opt_stats_get_size(void)
3483{
3484 return
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
		nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
3510 0;
3511}
3512
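/* Build the OPT_STATS blob reported alongside TX timestamps when the socket
 * enables SOF_TIMESTAMPING_OPT_STATS: a freshly allocated skb carrying
 * TCP_NLA_* attributes that snapshot the state of the connection.
 */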
3513struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3514 const struct sk_buff *orig_skb)
3515{
3516 const struct tcp_sock *tp = tcp_sk(sk);
3517 struct sk_buff *stats;
3518 struct tcp_info info;
3519 unsigned long rate;
3520 u64 rate64;
3521
3522 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3523 if (!stats)
3524 return NULL;
3525
3526 tcp_get_info_chrono_stats(tp, &info);
3527 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3528 info.tcpi_busy_time, TCP_NLA_PAD);
3529 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3530 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3531 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3532 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3533 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3534 tp->data_segs_out, TCP_NLA_PAD);
3535 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3536 tp->total_retrans, TCP_NLA_PAD);
3537
3538 rate = READ_ONCE(sk->sk_pacing_rate);
3539 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3540 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3541
3542 rate64 = tcp_compute_delivery_rate(tp);
3543 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3544
3545 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3546 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3547 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3548
3549 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3550 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3551 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3552 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3553 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3554
3555 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3556 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3557
3558 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3559 TCP_NLA_PAD);
3560 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3561 TCP_NLA_PAD);
3562 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3563 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3564 nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3565 nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
3566 nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
3567 max_t(int, 0, tp->write_seq - tp->snd_nxt));
3568 nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
3569 TCP_NLA_PAD);
3570
3571 return stats;
3572}
3573
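/* Read side of the TCP socket options.  Most options return a plain int via
 * the common copy-out at the end; the structured ones (TCP_INFO, TCP_CC_INFO,
 * TCP_REPAIR_WINDOW, TCP_SAVED_SYN, TCP_ZEROCOPY_RECEIVE, ...) copy their
 * payload and return early.
 */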
3574static int do_tcp_getsockopt(struct sock *sk, int level,
3575 int optname, char __user *optval, int __user *optlen)
3576{
3577 struct inet_connection_sock *icsk = inet_csk(sk);
3578 struct tcp_sock *tp = tcp_sk(sk);
3579 struct net *net = sock_net(sk);
3580 int val, len;
3581
3582 if (get_user(len, optlen))
3583 return -EFAULT;
3584
3585 len = min_t(unsigned int, len, sizeof(int));
3586
3587 if (len < 0)
3588 return -EINVAL;
3589
3590 switch (optname) {
3591 case TCP_MAXSEG:
3592 val = tp->mss_cache;
3593 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3594 val = tp->rx_opt.user_mss;
3595 if (tp->repair)
3596 val = tp->rx_opt.mss_clamp;
3597 break;
3598 case TCP_NODELAY:
3599 val = !!(tp->nonagle&TCP_NAGLE_OFF);
3600 break;
3601 case TCP_CORK:
3602 val = !!(tp->nonagle&TCP_NAGLE_CORK);
3603 break;
3604 case TCP_KEEPIDLE:
3605 val = keepalive_time_when(tp) / HZ;
3606 break;
3607 case TCP_KEEPINTVL:
3608 val = keepalive_intvl_when(tp) / HZ;
3609 break;
3610 case TCP_KEEPCNT:
3611 val = keepalive_probes(tp);
3612 break;
3613 case TCP_SYNCNT:
3614 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3615 break;
3616 case TCP_LINGER2:
3617 val = tp->linger2;
3618 if (val >= 0)
3619 val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3620 break;
3621 case TCP_DEFER_ACCEPT:
3622 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3623 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3624 break;
3625 case TCP_WINDOW_CLAMP:
3626 val = tp->window_clamp;
3627 break;
3628 case TCP_INFO: {
3629 struct tcp_info info;
3630
3631 if (get_user(len, optlen))
3632 return -EFAULT;
3633
3634 tcp_get_info(sk, &info);
3635
3636 len = min_t(unsigned int, len, sizeof(info));
3637 if (put_user(len, optlen))
3638 return -EFAULT;
3639 if (copy_to_user(optval, &info, len))
3640 return -EFAULT;
3641 return 0;
3642 }
3643 case TCP_CC_INFO: {
3644 const struct tcp_congestion_ops *ca_ops;
3645 union tcp_cc_info info;
3646 size_t sz = 0;
3647 int attr;
3648
3649 if (get_user(len, optlen))
3650 return -EFAULT;
3651
3652 ca_ops = icsk->icsk_ca_ops;
3653 if (ca_ops && ca_ops->get_info)
3654 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
3655
3656 len = min_t(unsigned int, len, sz);
3657 if (put_user(len, optlen))
3658 return -EFAULT;
3659 if (copy_to_user(optval, &info, len))
3660 return -EFAULT;
3661 return 0;
3662 }
3663 case TCP_QUICKACK:
3664 val = !inet_csk_in_pingpong_mode(sk);
3665 break;
3666
3667 case TCP_CONGESTION:
3668 if (get_user(len, optlen))
3669 return -EFAULT;
3670 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
3671 if (put_user(len, optlen))
3672 return -EFAULT;
3673 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
3674 return -EFAULT;
3675 return 0;
3676
3677 case TCP_ULP:
3678 if (get_user(len, optlen))
3679 return -EFAULT;
3680 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
3681 if (!icsk->icsk_ulp_ops) {
3682 if (put_user(0, optlen))
3683 return -EFAULT;
3684 return 0;
3685 }
3686 if (put_user(len, optlen))
3687 return -EFAULT;
3688 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
3689 return -EFAULT;
3690 return 0;
3691
3692 case TCP_FASTOPEN_KEY: {
3693 u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
3694 unsigned int key_len;
3695
3696 if (get_user(len, optlen))
3697 return -EFAULT;
3698
3699 key_len = tcp_fastopen_get_cipher(net, icsk, key) *
3700 TCP_FASTOPEN_KEY_LENGTH;
3701 len = min_t(unsigned int, len, key_len);
3702 if (put_user(len, optlen))
3703 return -EFAULT;
3704 if (copy_to_user(optval, key, len))
3705 return -EFAULT;
3706 return 0;
3707 }
3708 case TCP_THIN_LINEAR_TIMEOUTS:
3709 val = tp->thin_lto;
3710 break;
3711
3712 case TCP_THIN_DUPACK:
3713 val = 0;
3714 break;
3715
3716 case TCP_REPAIR:
3717 val = tp->repair;
3718 break;
3719
3720 case TCP_REPAIR_QUEUE:
3721 if (tp->repair)
3722 val = tp->repair_queue;
3723 else
3724 return -EINVAL;
3725 break;
3726
3727 case TCP_REPAIR_WINDOW: {
3728 struct tcp_repair_window opt;
3729
3730 if (get_user(len, optlen))
3731 return -EFAULT;
3732
3733 if (len != sizeof(opt))
3734 return -EINVAL;
3735
3736 if (!tp->repair)
3737 return -EPERM;
3738
3739 opt.snd_wl1 = tp->snd_wl1;
3740 opt.snd_wnd = tp->snd_wnd;
3741 opt.max_window = tp->max_window;
3742 opt.rcv_wnd = tp->rcv_wnd;
3743 opt.rcv_wup = tp->rcv_wup;
3744
3745 if (copy_to_user(optval, &opt, len))
3746 return -EFAULT;
3747 return 0;
3748 }
3749 case TCP_QUEUE_SEQ:
3750 if (tp->repair_queue == TCP_SEND_QUEUE)
3751 val = tp->write_seq;
3752 else if (tp->repair_queue == TCP_RECV_QUEUE)
3753 val = tp->rcv_nxt;
3754 else
3755 return -EINVAL;
3756 break;
3757
3758 case TCP_USER_TIMEOUT:
3759 val = icsk->icsk_user_timeout;
3760 break;
3761
3762 case TCP_FASTOPEN:
3763 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
3764 break;
3765
3766 case TCP_FASTOPEN_CONNECT:
3767 val = tp->fastopen_connect;
3768 break;
3769
3770 case TCP_FASTOPEN_NO_COOKIE:
3771 val = tp->fastopen_no_cookie;
3772 break;
3773
3774 case TCP_TX_DELAY:
3775 val = tp->tcp_tx_delay;
3776 break;
3777
3778 case TCP_TIMESTAMP:
3779 val = tcp_time_stamp_raw() + tp->tsoffset;
3780 break;
3781 case TCP_NOTSENT_LOWAT:
3782 val = tp->notsent_lowat;
3783 break;
3784 case TCP_INQ:
3785 val = tp->recvmsg_inq;
3786 break;
3787 case TCP_SAVE_SYN:
3788 val = tp->save_syn;
3789 break;
3790 case TCP_SAVED_SYN: {
3791 if (get_user(len, optlen))
3792 return -EFAULT;
3793
3794 lock_sock(sk);
3795 if (tp->saved_syn) {
3796 if (len < tcp_saved_syn_len(tp->saved_syn)) {
3797 if (put_user(tcp_saved_syn_len(tp->saved_syn),
3798 optlen)) {
3799 release_sock(sk);
3800 return -EFAULT;
3801 }
3802 release_sock(sk);
3803 return -EINVAL;
3804 }
3805 len = tcp_saved_syn_len(tp->saved_syn);
3806 if (put_user(len, optlen)) {
3807 release_sock(sk);
3808 return -EFAULT;
3809 }
3810 if (copy_to_user(optval, tp->saved_syn->data, len)) {
3811 release_sock(sk);
3812 return -EFAULT;
3813 }
3814 tcp_saved_syn_free(tp);
3815 release_sock(sk);
3816 } else {
3817 release_sock(sk);
3818 len = 0;
3819 if (put_user(len, optlen))
3820 return -EFAULT;
3821 }
3822 return 0;
3823 }
3824#ifdef CONFIG_MMU
3825 case TCP_ZEROCOPY_RECEIVE: {
3826 struct tcp_zerocopy_receive zc;
3827 int err;
3828
3829 if (get_user(len, optlen))
3830 return -EFAULT;
3831 if (len < offsetofend(struct tcp_zerocopy_receive, length))
3832 return -EINVAL;
3833 if (len > sizeof(zc)) {
3834 len = sizeof(zc);
3835 if (put_user(len, optlen))
3836 return -EFAULT;
3837 }
3838 if (copy_from_user(&zc, optval, len))
3839 return -EFAULT;
3840 lock_sock(sk);
3841 err = tcp_zerocopy_receive(sk, &zc);
3842 release_sock(sk);
3843 if (len == sizeof(zc))
3844 goto zerocopy_rcv_sk_err;
3845 switch (len) {
3846 case offsetofend(struct tcp_zerocopy_receive, err):
3847 goto zerocopy_rcv_sk_err;
3848 case offsetofend(struct tcp_zerocopy_receive, inq):
3849 goto zerocopy_rcv_inq;
3850 case offsetofend(struct tcp_zerocopy_receive, length):
3851 default:
3852 goto zerocopy_rcv_out;
3853 }
3854zerocopy_rcv_sk_err:
3855 if (!err)
3856 zc.err = sock_error(sk);
3857zerocopy_rcv_inq:
3858 zc.inq = tcp_inq_hint(sk);
3859zerocopy_rcv_out:
3860 if (!err && copy_to_user(optval, &zc, len))
3861 err = -EFAULT;
3862 return err;
3863 }
3864#endif
3865 default:
3866 return -ENOPROTOOPT;
3867 }
3868
3869 if (put_user(len, optlen))
3870 return -EFAULT;
3871 if (copy_to_user(optval, &val, len))
3872 return -EFAULT;
3873 return 0;
3874}
3875
3876int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
3877 int __user *optlen)
3878{
3879 struct inet_connection_sock *icsk = inet_csk(sk);
3880
3881 if (level != SOL_TCP)
3882 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
3883 optval, optlen);
3884 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3885}
3886EXPORT_SYMBOL(tcp_getsockopt);
3887
3888#ifdef CONFIG_TCP_MD5SIG
3889static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
3890static DEFINE_MUTEX(tcp_md5sig_mutex);
3891static bool tcp_md5sig_pool_populated = false;
3892
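/* Allocate the MD5 transform plus a per-cpu scratch buffer and ahash request.
 * Runs under tcp_md5sig_mutex; tcp_md5sig_pool_populated is only set once
 * every possible CPU has been provisioned.
 */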
3893static void __tcp_alloc_md5sig_pool(void)
3894{
3895 struct crypto_ahash *hash;
3896 int cpu;
3897
3898 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
3899 if (IS_ERR(hash))
3900 return;
3901
3902 for_each_possible_cpu(cpu) {
3903 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
3904 struct ahash_request *req;
3905
3906 if (!scratch) {
3907 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
3908 sizeof(struct tcphdr),
3909 GFP_KERNEL,
3910 cpu_to_node(cpu));
3911 if (!scratch)
3912 return;
3913 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
3914 }
3915 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
3916 continue;
3917
3918 req = ahash_request_alloc(hash, GFP_KERNEL);
3919 if (!req)
3920 return;
3921
3922 ahash_request_set_callback(req, 0, NULL, NULL);
3923
3924 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
3925 }
	/* before setting tcp_md5sig_pool_populated, we must commit all writes
	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
	 */
	smp_wmb();
3930 tcp_md5sig_pool_populated = true;
3931}
3932
3933bool tcp_alloc_md5sig_pool(void)
3934{
3935 if (unlikely(!tcp_md5sig_pool_populated)) {
3936 mutex_lock(&tcp_md5sig_mutex);
3937
3938 if (!tcp_md5sig_pool_populated) {
3939 __tcp_alloc_md5sig_pool();
3940 if (tcp_md5sig_pool_populated)
3941 static_branch_inc(&tcp_md5_needed);
3942 }
3943
3944 mutex_unlock(&tcp_md5sig_mutex);
3945 }
3946 return tcp_md5sig_pool_populated;
3947}
3948EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3949
/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	We use percpu structure, so if we succeed, we exit with preemption
 *	and BH disabled, to make sure another thread or softirq handling
 *	will try to do the same.  The returned pool is only valid until the
 *	matching local_bh_enable().
 */
struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3959{
3960 local_bh_disable();
3961
3962 if (tcp_md5sig_pool_populated) {
		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
		smp_rmb();
3965 return this_cpu_ptr(&tcp_md5sig_pool);
3966 }
3967 local_bh_enable();
3968 return NULL;
3969}
3970EXPORT_SYMBOL(tcp_get_md5sig_pool);
3971
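/* Hash the TCP payload of @skb starting @header_len bytes in: first the
 * linear head, then every page fragment, then any frag_list skbs, feeding
 * each piece into the per-cpu ahash request.
 */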
3972int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3973 const struct sk_buff *skb, unsigned int header_len)
3974{
3975 struct scatterlist sg;
3976 const struct tcphdr *tp = tcp_hdr(skb);
3977 struct ahash_request *req = hp->md5_req;
3978 unsigned int i;
3979 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3980 skb_headlen(skb) - header_len : 0;
3981 const struct skb_shared_info *shi = skb_shinfo(skb);
3982 struct sk_buff *frag_iter;
3983
3984 sg_init_table(&sg, 1);
3985
3986 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3987 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
3988 if (crypto_ahash_update(req))
3989 return 1;
3990
3991 for (i = 0; i < shi->nr_frags; ++i) {
3992 const skb_frag_t *f = &shi->frags[i];
3993 unsigned int offset = skb_frag_off(f);
3994 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3995
3996 sg_set_page(&sg, page, skb_frag_size(f),
3997 offset_in_page(offset));
3998 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
3999 if (crypto_ahash_update(req))
4000 return 1;
4001 }
4002
4003 skb_walk_frags(skb, frag_iter)
4004 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
4005 return 1;
4006
4007 return 0;
4008}
4009EXPORT_SYMBOL(tcp_md5_hash_skb_data);
4010
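/* Feed the MD5 key itself into the hash request.  The key may be updated
 * concurrently by tcp_md5_do_add(), hence the READ_ONCE()/data_race()
 * annotations below.
 */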
4011int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
4012{
4013 u8 keylen = READ_ONCE(key->keylen);
4014 struct scatterlist sg;
4015
4016 sg_init_one(&sg, key->key, keylen);
4017 ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);

	/* We use data_race() because tcp_md5_do_add() might change key->key under us */
	return data_race(crypto_ahash_update(hp->md5_req));
4021}
4022EXPORT_SYMBOL(tcp_md5_hash_key);
4023
4024#endif
4025
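/* Move the socket to TCP_CLOSE and stop its timers once the connection has
 * reached a terminal state (RST, abort, or the final FIN handshake).  An
 * orphaned (SOCK_DEAD) socket is destroyed here; otherwise userspace is
 * woken up to observe the state change.
 */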
4026void tcp_done(struct sock *sk)
4027{
4028 struct request_sock *req;
4029
	/* We might be called with a new socket, after
	 * inet_csk_prepare_forced_close() has been called
	 * so we can not use lockdep_sock_is_held(sk)
	 */
	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
4035
4036 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
4037 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
4038
4039 tcp_set_state(sk, TCP_CLOSE);
4040 tcp_clear_xmit_timers(sk);
4041 if (req)
4042 reqsk_fastopen_remove(sk, req, false);
4043
4044 sk->sk_shutdown = SHUTDOWN_MASK;
4045
4046 if (!sock_flag(sk, SOCK_DEAD))
4047 sk->sk_state_change(sk);
4048 else
4049 inet_csk_destroy_sock(sk);
4050}
4051EXPORT_SYMBOL_GPL(tcp_done);
4052
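/* Forcibly terminate a socket, e.g. on behalf of SOCK_DESTROY from inet_diag.
 * Request socks are dropped from their listener's queue; full sockets get
 * sk_err set, an RST where the state requires one, and tcp_done().
 */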
4053int tcp_abort(struct sock *sk, int err)
4054{
4055 if (!sk_fullsock(sk)) {
4056 if (sk->sk_state == TCP_NEW_SYN_RECV) {
4057 struct request_sock *req = inet_reqsk(sk);
4058
4059 local_bh_disable();
4060 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
4061 local_bh_enable();
4062 return 0;
4063 }
4064 return -EOPNOTSUPP;
4065 }
4066
	/* Don't race with userspace socket closes such as tcp_close. */
	lock_sock(sk);
4069
4070 if (sk->sk_state == TCP_LISTEN) {
4071 tcp_set_state(sk, TCP_CLOSE);
4072 inet_csk_listen_stop(sk);
4073 }
4074
	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
	local_bh_disable();
4077 bh_lock_sock(sk);
4078
4079 if (!sock_flag(sk, SOCK_DEAD)) {
4080 sk->sk_err = err;
		/* This barrier is coupled with smp_rmb() in tcp_poll() */
		smp_wmb();
4083 sk->sk_error_report(sk);
4084 if (tcp_need_reset(sk->sk_state))
4085 tcp_send_active_reset(sk, GFP_ATOMIC);
4086 tcp_done(sk);
4087 }
4088
4089 bh_unlock_sock(sk);
4090 local_bh_enable();
4091 tcp_write_queue_purge(sk);
4092 release_sock(sk);
4093 return 0;
4094}
4095EXPORT_SYMBOL_GPL(tcp_abort);
4096
4097extern struct tcp_congestion_ops tcp_reno;
4098
4099static __initdata unsigned long thash_entries;
4100static int __init set_thash_entries(char *str)
4101{
4102 ssize_t ret;
4103
4104 if (!str)
4105 return 0;
4106
4107 ret = kstrtoul(str, 0, &thash_entries);
4108 if (ret)
4109 return 0;
4110
4111 return 1;
4112}
4113__setup("thash_entries=", set_thash_entries);
4114
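/* Default tcp_mem[] thresholds, in pages: the pressure threshold is 1/16 of
 * the free buffer pages (at least 128), the low watermark 3/4 of that, and
 * the hard limit twice the low watermark.
 */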
4115static void __init tcp_init_mem(void)
4116{
4117 unsigned long limit = nr_free_buffer_pages() / 16;
4118
4119 limit = max(limit, 128UL);
4120 sysctl_tcp_mem[0] = limit / 4 * 3;
4121 sysctl_tcp_mem[1] = limit;
4122 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
4123}
4124
4125void __init tcp_init(void)
4126{
4127 int max_rshare, max_wshare, cnt;
4128 unsigned long limit;
4129 unsigned int i;
4130
4131 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
4132 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
4133 sizeof_field(struct sk_buff, cb));
4134
4135 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
4136 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
4137 inet_hashinfo_init(&tcp_hashinfo);
4138 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
4139 thash_entries, 21,
4140 0, 64 * 1024);
4141 tcp_hashinfo.bind_bucket_cachep =
4142 kmem_cache_create("tcp_bind_bucket",
4143 sizeof(struct inet_bind_bucket), 0,
4144 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
4145
	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	tcp_hashinfo.ehash =
4152 alloc_large_system_hash("TCP established",
4153 sizeof(struct inet_ehash_bucket),
4154 thash_entries,
4155 17,
4156 0,
4157 NULL,
4158 &tcp_hashinfo.ehash_mask,
4159 0,
4160 thash_entries ? 0 : 512 * 1024);
4161 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
4162 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4163
4164 if (inet_ehash_locks_alloc(&tcp_hashinfo))
4165 panic("TCP: failed to alloc ehash_locks");
4166 tcp_hashinfo.bhash =
4167 alloc_large_system_hash("TCP bind",
4168 sizeof(struct inet_bind_hashbucket),
4169 tcp_hashinfo.ehash_mask + 1,
4170 17,
4171 0,
4172 &tcp_hashinfo.bhash_size,
4173 NULL,
4174 0,
4175 64 * 1024);
4176 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4177 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4178 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4179 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4180 }
4181
4182
4183 cnt = tcp_hashinfo.ehash_mask + 1;
4184 sysctl_tcp_max_orphans = cnt / 2;
4185
4186 tcp_init_mem();
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4189 max_wshare = min(4UL*1024*1024, limit);
4190 max_rshare = min(6UL*1024*1024, limit);
4191
4192 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
4193 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4194 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4195
4196 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
4197 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4198 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4199
4200 pr_info("Hash tables configured (established %u bind %u)\n",
4201 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4202
4203 tcp_v4_init();
4204 tcp_metrics_init();
4205 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4206 tcp_tasklet_init();
4207 mptcp_init();
4208}
4209