// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>
#include <linux/btf.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

enum {
	TCP_CMSG_INQ = 1,
	TCP_CMSG_TS = 2
};
289
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * TCP splice context
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/* Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
EXPORT_SYMBOL(tcp_rx_skb_cache_key);

DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
332
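/* Enter TCP memory pressure: record the entry time in
 * tcp_memory_pressure (non-zero means "under pressure") and bump the
 * MIB counter once per pressure episode.
 */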
333void tcp_enter_memory_pressure(struct sock *sk)
334{
335 unsigned long val;
336
337 if (READ_ONCE(tcp_memory_pressure))
338 return;
339 val = jiffies;
340
341 if (!val)
342 val--;
343 if (!cmpxchg(&tcp_memory_pressure, 0, val))
344 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
345}
346EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
347
348void tcp_leave_memory_pressure(struct sock *sk)
349{
350 unsigned long val;
351
352 if (!READ_ONCE(tcp_memory_pressure))
353 return;
354 val = xchg(&tcp_memory_pressure, 0);
355 if (val)
356 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
357 jiffies_to_msecs(jiffies - val));
358}
359EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
360
/* Convert seconds to retransmits based on initial and max timeout */
362static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
363{
364 u8 res = 0;
365
366 if (seconds > 0) {
367 int period = timeout;
368
369 res = 1;
370 while (seconds > period && res < 255) {
371 res++;
372 timeout <<= 1;
373 if (timeout > rto_max)
374 timeout = rto_max;
375 period += timeout;
376 }
377 }
378 return res;
379}
380
/* Convert retransmits to seconds based on initial and max timeout */
382static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
383{
384 int period = 0;
385
386 if (retrans > 0) {
387 period = timeout;
388 while (--retrans) {
389 timeout <<= 1;
390 if (timeout > rto_max)
391 timeout = rto_max;
392 period += timeout;
393 }
394 }
395 return period;
396}
397
398static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
399{
400 u32 rate = READ_ONCE(tp->rate_delivered);
401 u32 intv = READ_ONCE(tp->rate_interval_us);
402 u64 rate64 = 0;
403
404 if (rate && intv) {
405 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
406 do_div(rate64, intv);
407 }
408 return rate64;
409}
410
/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
416void tcp_init_sock(struct sock *sk)
417{
418 struct inet_connection_sock *icsk = inet_csk(sk);
419 struct tcp_sock *tp = tcp_sk(sk);
420
421 tp->out_of_order_queue = RB_ROOT;
422 sk->tcp_rtx_queue = RB_ROOT;
423 tcp_init_xmit_timers(sk);
424 INIT_LIST_HEAD(&tp->tsq_node);
425 INIT_LIST_HEAD(&tp->tsorted_sent_queue);
426
427 icsk->icsk_rto = TCP_TIMEOUT_INIT;
428 icsk->icsk_rto_min = TCP_RTO_MIN;
429 icsk->icsk_delack_max = TCP_DELACK_MAX;
430 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
431 minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
432
	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
446 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
447 tp->snd_cwnd_clamp = ~0;
448 tp->mss_cache = TCP_MSS_DEFAULT;
449
450 tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
451 tcp_assign_congestion_control(sk);
452
453 tp->tsoffset = 0;
454 tp->rack.reo_wnd_steps = 1;
455
456 sk->sk_write_space = sk_stream_write_space;
457 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
458
459 icsk->icsk_sync_mss = tcp_sync_mss;
460
461 WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
462 WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
463
464 sk_sockets_allocated_inc(sk);
465 sk->sk_route_forced_caps = NETIF_F_GSO;
466}
467EXPORT_SYMBOL(tcp_init_sock);
468
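/* Propagate the requested tx timestamp flags to the skb at the tail of
 * the write queue, so the last byte of this write is the one that gets
 * timestamped (tskey = last sequence number covered by the skb).
 */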
469static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
470{
471 struct sk_buff *skb = tcp_write_queue_tail(sk);
472
473 if (tsflags && skb) {
474 struct skb_shared_info *shinfo = skb_shinfo(skb);
475 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
476
477 sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
478 if (tsflags & SOF_TIMESTAMPING_TX_ACK)
479 tcb->txstamp_ack = 1;
480 if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
481 shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
482 }
483}
484
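/* A stream socket is readable when enough bytes are queued to satisfy
 * the low-water mark, or when the protocol (e.g. a ULP) reports
 * in-kernel readable data via ->stream_memory_read().
 */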
485static bool tcp_stream_is_readable(struct sock *sk, int target)
486{
487 if (tcp_epollin_ready(sk, target))
488 return true;
489
490 if (sk->sk_prot->stream_memory_read)
491 return sk->sk_prot->stream_memory_read(sk);
492 return false;
493}
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
502__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
503{
504 __poll_t mask;
505 struct sock *sk = sock->sk;
506 const struct tcp_sock *tp = tcp_sk(sk);
507 int state;
508
509 sock_poll_wait(file, sock, wait);
510
511 state = inet_sk_state_load(sk);
512 if (state == TCP_LISTEN)
513 return inet_csk_listen_poll(sk);
514
	/* Socket is not locked. We are protected from async events
	 * by poll logic and correct handling of state changes
	 * made by other threads is impossible in any case.
	 */

	mask = 0;

	/* EPOLLHUP is only reported once both directions are shut down
	 * (SHUTDOWN_MASK) or the socket is in TCP_CLOSE: POSIX requires
	 * EPOLLHUP to be level-triggered and unmaskable, so it must not
	 * be raised while a half-open connection can still make progress.
	 * A receive-side shutdown alone is signalled below with
	 * EPOLLIN | EPOLLRDNORM | EPOLLRDHUP instead.
	 */
549 if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
550 mask |= EPOLLHUP;
551 if (sk->sk_shutdown & RCV_SHUTDOWN)
552 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
553
	/* Connected or passive Fast Open socket? */
555 if (state != TCP_SYN_SENT &&
556 (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
557 int target = sock_rcvlowat(sk, 0, INT_MAX);
558
559 if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
560 !sock_flag(sk, SOCK_URGINLINE) &&
561 tp->urg_data)
562 target++;
563
564 if (tcp_stream_is_readable(sk, target))
565 mask |= EPOLLIN | EPOLLRDNORM;
566
567 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
568 if (__sk_stream_is_writeable(sk, 1)) {
569 mask |= EPOLLOUT | EPOLLWRNORM;
570 } else {
571 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
572 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost. Memory barrier
				 * pairs with the input side.
				 */
579 smp_mb__after_atomic();
580 if (__sk_stream_is_writeable(sk, 1))
581 mask |= EPOLLOUT | EPOLLWRNORM;
582 }
583 } else
584 mask |= EPOLLOUT | EPOLLWRNORM;
585
586 if (tp->urg_data & TCP_URG_VALID)
587 mask |= EPOLLPRI;
588 } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		/* Active TCP fastopen socket with defer_connect
		 * Return EPOLLOUT so application can call write()
		 * in order for kernel to generate SYN+data
		 */
593 mask |= EPOLLOUT | EPOLLWRNORM;
594 }
595
596 smp_rmb();
597 if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
598 mask |= EPOLLERR;
599
600 return mask;
601}
602EXPORT_SYMBOL(tcp_poll);
603
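/* ioctl(2) handler: SIOCINQ (bytes readable), SIOCATMARK (at the urgent
 * mark), SIOCOUTQ (bytes queued but not yet acknowledged) and
 * SIOCOUTQNSD (bytes queued but not yet sent).
 */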
604int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
605{
606 struct tcp_sock *tp = tcp_sk(sk);
607 int answ;
608 bool slow;
609
610 switch (cmd) {
611 case SIOCINQ:
612 if (sk->sk_state == TCP_LISTEN)
613 return -EINVAL;
614
615 slow = lock_sock_fast(sk);
616 answ = tcp_inq(sk);
617 unlock_sock_fast(sk, slow);
618 break;
619 case SIOCATMARK:
620 answ = tp->urg_data &&
621 READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
622 break;
623 case SIOCOUTQ:
624 if (sk->sk_state == TCP_LISTEN)
625 return -EINVAL;
626
627 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
628 answ = 0;
629 else
630 answ = READ_ONCE(tp->write_seq) - tp->snd_una;
631 break;
632 case SIOCOUTQNSD:
633 if (sk->sk_state == TCP_LISTEN)
634 return -EINVAL;
635
636 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
637 answ = 0;
638 else
639 answ = READ_ONCE(tp->write_seq) -
640 READ_ONCE(tp->snd_nxt);
641 break;
642 default:
643 return -ENOIOCTLCMD;
644 }
645
646 return put_user(answ, (int __user *)arg);
647}
648EXPORT_SYMBOL(tcp_ioctl);
649
650static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
651{
652 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
653 tp->pushed_seq = tp->write_seq;
654}
655
656static inline bool forced_push(const struct tcp_sock *tp)
657{
658 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
659}
660
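/* Append a freshly allocated skb to the tail of the write queue and
 * charge its truesize to the socket's send buffer accounting.
 */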
661static void skb_entail(struct sock *sk, struct sk_buff *skb)
662{
663 struct tcp_sock *tp = tcp_sk(sk);
664 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
665
666 skb->csum = 0;
667 tcb->seq = tcb->end_seq = tp->write_seq;
668 tcb->tcp_flags = TCPHDR_ACK;
669 tcb->sacked = 0;
670 __skb_header_release(skb);
671 tcp_add_write_queue_tail(sk, skb);
672 sk_wmem_queued_add(sk, skb->truesize);
673 sk_mem_charge(sk, skb->truesize);
674 if (tp->nonagle & TCP_NAGLE_PUSH)
675 tp->nonagle &= ~TCP_NAGLE_PUSH;
676
677 tcp_slow_start_after_idle_check(sk);
678}
679
680static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
681{
682 if (flags & MSG_OOB)
683 tp->snd_up = tp->write_seq;
684}
685
/* If a not yet filled skb is pushed, do not send it if
 * we have data packets in Qdisc or NIC queues :
 * Because TX completion will happen shortly, it gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade off.
 */
696static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
697 int size_goal)
698{
699 return skb->len < size_goal &&
700 sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
701 !tcp_rtx_queue_empty(sk) &&
702 refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
703}
704
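/* Push pending frames out, honouring MSG_MORE/MSG_OOB and the
 * autocorking heuristic: a small tail skb is held back while older data
 * is still in the qdisc or NIC queues, so later writes can be coalesced
 * into it without needing a timer.
 */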
705void tcp_push(struct sock *sk, int flags, int mss_now,
706 int nonagle, int size_goal)
707{
708 struct tcp_sock *tp = tcp_sk(sk);
709 struct sk_buff *skb;
710
711 skb = tcp_write_queue_tail(sk);
712 if (!skb)
713 return;
714 if (!(flags & MSG_MORE) || forced_push(tp))
715 tcp_mark_push(tp, skb);
716
717 tcp_mark_urg(tp, flags);
718
719 if (tcp_should_autocork(sk, skb, size_goal)) {
720
721
722 if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
723 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
724 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
725 }
726
727
728
729 if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
730 return;
731 }
732
733 if (flags & MSG_MORE)
734 nonagle = TCP_NAGLE_CORK;
735
736 __tcp_push_pending_frames(sk, mss_now, nonagle);
737}
738
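/* Splice helpers: tcp_splice_data_recv() is the read actor that moves
 * skb data into the destination pipe, and __tcp_splice_read() drives it
 * through tcp_read_sock() for up to tss->len bytes.
 */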
739static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
740 unsigned int offset, size_t len)
741{
742 struct tcp_splice_state *tss = rd_desc->arg.data;
743 int ret;
744
745 ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
746 min(rd_desc->count, len), tss->flags);
747 if (ret > 0)
748 rd_desc->count -= ret;
749 return ret;
750}
751
752static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
753{
754
755 read_descriptor_t rd_desc = {
756 .arg.data = tss,
757 .count = tss->len,
758 };
759
760 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
761}
762
/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:	socket to splice from
 * @ppos:	position (not valid)
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 **/
775ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
776 struct pipe_inode_info *pipe, size_t len,
777 unsigned int flags)
778{
779 struct sock *sk = sock->sk;
780 struct tcp_splice_state tss = {
781 .pipe = pipe,
782 .len = len,
783 .flags = flags,
784 };
785 long timeo;
786 ssize_t spliced;
787 int ret;
788
789 sock_rps_record_flow(sk);
790
791
792
793 if (unlikely(*ppos))
794 return -ESPIPE;
795
796 ret = spliced = 0;
797
798 lock_sock(sk);
799
800 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
801 while (tss.len) {
802 ret = __tcp_splice_read(sk, &tss);
803 if (ret < 0)
804 break;
805 else if (!ret) {
806 if (spliced)
807 break;
808 if (sock_flag(sk, SOCK_DONE))
809 break;
810 if (sk->sk_err) {
811 ret = sock_error(sk);
812 break;
813 }
814 if (sk->sk_shutdown & RCV_SHUTDOWN)
815 break;
816 if (sk->sk_state == TCP_CLOSE) {
817
818
819
820
821 ret = -ENOTCONN;
822 break;
823 }
824 if (!timeo) {
825 ret = -EAGAIN;
826 break;
827 }
828
829
830
831
832 if (!skb_queue_empty(&sk->sk_receive_queue))
833 break;
834 sk_wait_data(sk, &timeo, NULL);
835 if (signal_pending(current)) {
836 ret = sock_intr_errno(timeo);
837 break;
838 }
839 continue;
840 }
841 tss.len -= ret;
842 spliced += ret;
843
844 if (!timeo)
845 break;
846 release_sock(sk);
847 lock_sock(sk);
848
849 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
850 (sk->sk_shutdown & RCV_SHUTDOWN) ||
851 signal_pending(current))
852 break;
853 }
854
855 release_sock(sk);
856
857 if (spliced)
858 return spliced;
859
860 return ret;
861}
862EXPORT_SYMBOL(tcp_splice_read);
863
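/* Allocate an skb for transmit, preferring the per-socket tx skb cache
 * for zero-sized requests; reserves max_header bytes of headroom and
 * returns NULL (after moderating the send buffer) when memory cannot be
 * scheduled.
 */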
864struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
865 bool force_schedule)
866{
867 struct sk_buff *skb;
868
869 if (likely(!size)) {
870 skb = sk->sk_tx_skb_cache;
871 if (skb) {
872 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
873 sk->sk_tx_skb_cache = NULL;
874 pskb_trim(skb, 0);
875 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
876 skb_shinfo(skb)->tx_flags = 0;
877 memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
878 return skb;
879 }
880 }
881
882 size = ALIGN(size, 4);
883
884 if (unlikely(tcp_under_memory_pressure(sk)))
885 sk_mem_reclaim_partial(sk);
886
887 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
888 if (likely(skb)) {
889 bool mem_scheduled;
890
891 if (force_schedule) {
892 mem_scheduled = true;
893 sk_forced_mem_schedule(sk, skb->truesize);
894 } else {
895 mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
896 }
897 if (likely(mem_scheduled)) {
898 skb_reserve(skb, sk->sk_prot->max_header);
899
900
901
902
903 skb->reserved_tailroom = skb->end - skb->tail - size;
904 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
905 return skb;
906 }
907 __kfree_skb(skb);
908 } else {
909 sk->sk_prot->enter_memory_pressure(sk);
910 sk_stream_moderate_sndbuf(sk);
911 }
912 return NULL;
913}
914
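/* Compute how many payload bytes we should try to put in one skb: the
 * GSO size goal, bounded by half the largest window seen and by
 * sk_gso_max_segs, or just mss_now when large skbs are not allowed
 * (e.g. MSG_OOB).
 */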
915static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
916 int large_allowed)
917{
918 struct tcp_sock *tp = tcp_sk(sk);
919 u32 new_size_goal, size_goal;
920
921 if (!large_allowed)
922 return mss_now;
923
924
925 new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
926 new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
927
928
929 size_goal = tp->gso_segs * mss_now;
930 if (unlikely(new_size_goal < size_goal ||
931 new_size_goal >= size_goal + mss_now)) {
932 tp->gso_segs = min_t(u16, new_size_goal / mss_now,
933 sk->sk_gso_max_segs);
934 size_goal = tp->gso_segs * mss_now;
935 }
936
937 return max(size_goal, mss_now);
938}
939
940int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
941{
942 int mss_now;
943
944 mss_now = tcp_current_mss(sk);
945 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
946
947 return mss_now;
948}
949
/* In some cases, both sendpage() and sendmsg() could have added
 * an skb to the write queue, but failed adding payload on it.
 * We need to remove it to consume less memory, but more
 * importantly be able to generate EPOLLOUT for Edge Triggered
 * epoll()
 */
956void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
957{
958 if (skb && !skb->len) {
959 tcp_unlink_write_queue(skb, sk);
960 if (tcp_write_queue_empty(sk))
961 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
962 sk_wmem_free_skb(sk, skb);
963 }
964}
965
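/* Append up to *size bytes of the given page to the tail skb as a page
 * fragment, coalescing with the last fragment when possible, or start a
 * new segment.  On success *size is updated to the bytes actually
 * queued; returns NULL when no memory or send buffer space is
 * available.
 */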
966struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
967 struct page *page, int offset, size_t *size)
968{
969 struct sk_buff *skb = tcp_write_queue_tail(sk);
970 struct tcp_sock *tp = tcp_sk(sk);
971 bool can_coalesce;
972 int copy, i;
973
974 if (!skb || (copy = size_goal - skb->len) <= 0 ||
975 !tcp_skb_can_collapse_to(skb)) {
976new_segment:
977 if (!sk_stream_memory_free(sk))
978 return NULL;
979
980 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
981 tcp_rtx_and_write_queues_empty(sk));
982 if (!skb)
983 return NULL;
984
985#ifdef CONFIG_TLS_DEVICE
986 skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
987#endif
988 skb_entail(sk, skb);
989 copy = size_goal;
990 }
991
992 if (copy > *size)
993 copy = *size;
994
995 i = skb_shinfo(skb)->nr_frags;
996 can_coalesce = skb_can_coalesce(skb, i, page, offset);
997 if (!can_coalesce && i >= sysctl_max_skb_frags) {
998 tcp_mark_push(tp, skb);
999 goto new_segment;
1000 }
1001 if (!sk_wmem_schedule(sk, copy))
1002 return NULL;
1003
1004 if (can_coalesce) {
1005 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1006 } else {
1007 get_page(page);
1008 skb_fill_page_desc(skb, i, page, offset, copy);
1009 }
1010
1011 if (!(flags & MSG_NO_SHARED_FRAGS))
1012 skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
1013
1014 skb->len += copy;
1015 skb->data_len += copy;
1016 skb->truesize += copy;
1017 sk_wmem_queued_add(sk, copy);
1018 sk_mem_charge(sk, copy);
1019 skb->ip_summed = CHECKSUM_PARTIAL;
1020 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1021 TCP_SKB_CB(skb)->end_seq += copy;
1022 tcp_skb_pcount_set(skb, 0);
1023
1024 *size = copy;
1025 return skb;
1026}
1027
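/* sendpage() backend: queue page fragments on the write queue and push
 * them out according to size_goal, Nagle/autocork state and the
 * MSG_SENDPAGE_NOTLAST hint.  The caller must hold the socket lock.
 */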
1028ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
1029 size_t size, int flags)
1030{
1031 struct tcp_sock *tp = tcp_sk(sk);
1032 int mss_now, size_goal;
1033 int err;
1034 ssize_t copied;
1035 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1036
1037 if (IS_ENABLED(CONFIG_DEBUG_VM) &&
1038 WARN_ONCE(!sendpage_ok(page),
1039 "page must not be a Slab one and have page_count > 0"))
1040 return -EINVAL;
1041
	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
1046 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1047 !tcp_passive_fastopen(sk)) {
1048 err = sk_stream_wait_connect(sk, &timeo);
1049 if (err != 0)
1050 goto out_err;
1051 }
1052
1053 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1054
1055 mss_now = tcp_send_mss(sk, &size_goal, flags);
1056 copied = 0;
1057
1058 err = -EPIPE;
1059 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1060 goto out_err;
1061
1062 while (size > 0) {
1063 struct sk_buff *skb;
1064 size_t copy = size;
1065
		skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
1067 if (!skb)
1068 goto wait_for_space;
1069
1070 if (!copied)
1071 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1072
1073 copied += copy;
1074 offset += copy;
1075 size -= copy;
1076 if (!size)
1077 goto out;
1078
1079 if (skb->len < size_goal || (flags & MSG_OOB))
1080 continue;
1081
1082 if (forced_push(tp)) {
1083 tcp_mark_push(tp, skb);
1084 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1085 } else if (skb == tcp_send_head(sk))
1086 tcp_push_one(sk, mss_now);
1087 continue;
1088
1089wait_for_space:
1090 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1091 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1092 TCP_NAGLE_PUSH, size_goal);
1093
1094 err = sk_stream_wait_memory(sk, &timeo);
1095 if (err != 0)
1096 goto do_error;
1097
1098 mss_now = tcp_send_mss(sk, &size_goal, flags);
1099 }
1100
1101out:
1102 if (copied) {
1103 tcp_tx_timestamp(sk, sk->sk_tsflags);
1104 if (!(flags & MSG_SENDPAGE_NOTLAST))
1105 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1106 }
1107 return copied;
1108
1109do_error:
1110 tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
1111 if (copied)
1112 goto out;
1113out_err:
1114
1115 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1116 sk->sk_write_space(sk);
1117 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1118 }
1119 return sk_stream_error(sk, flags, err);
1120}
1121EXPORT_SYMBOL_GPL(do_tcp_sendpages);
1122
1123int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
1124 size_t size, int flags)
1125{
1126 if (!(sk->sk_route_caps & NETIF_F_SG))
1127 return sock_no_sendpage_locked(sk, page, offset, size, flags);
1128
1129 tcp_rate_check_app_limited(sk);
1130
1131 return do_tcp_sendpages(sk, page, offset, size, flags);
1132}
1133EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
1134
1135int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1136 size_t size, int flags)
1137{
1138 int ret;
1139
1140 lock_sock(sk);
1141 ret = tcp_sendpage_locked(sk, page, offset, size, flags);
1142 release_sock(sk);
1143
1144 return ret;
1145}
1146EXPORT_SYMBOL(tcp_sendpage);
1147
1148void tcp_free_fastopen_req(struct tcp_sock *tp)
1149{
1150 if (tp->fastopen_req) {
1151 kfree(tp->fastopen_req);
1152 tp->fastopen_req = NULL;
1153 }
1154}
1155
1156static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1157 int *copied, size_t size,
1158 struct ubuf_info *uarg)
1159{
1160 struct tcp_sock *tp = tcp_sk(sk);
1161 struct inet_sock *inet = inet_sk(sk);
1162 struct sockaddr *uaddr = msg->msg_name;
1163 int err, flags;
1164
1165 if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
1166 (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1167 uaddr->sa_family == AF_UNSPEC))
1168 return -EOPNOTSUPP;
1169 if (tp->fastopen_req)
1170 return -EALREADY;
1171
1172 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1173 sk->sk_allocation);
1174 if (unlikely(!tp->fastopen_req))
1175 return -ENOBUFS;
1176 tp->fastopen_req->data = msg;
1177 tp->fastopen_req->size = size;
1178 tp->fastopen_req->uarg = uarg;
1179
1180 if (inet->defer_connect) {
1181 err = tcp_connect(sk);
1182
1183 if (err) {
1184 tcp_set_state(sk, TCP_CLOSE);
1185 inet->inet_dport = 0;
1186 sk->sk_route_caps = 0;
1187 }
1188 }
1189 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1190 err = __inet_stream_connect(sk->sk_socket, uaddr,
1191 msg->msg_namelen, flags, 1);
1192
1193
1194
1195 if (tp->fastopen_req) {
1196 *copied = tp->fastopen_req->copied;
1197 tcp_free_fastopen_req(tp);
1198 inet->defer_connect = 0;
1199 }
1200 return err;
1201}
1202
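/* Main sendmsg() path (socket lock held by the caller): handles Fast
 * Open SYN data, MSG_ZEROCOPY and repair mode, then copies user data
 * either into skb heads or into page fragments before pushing segments
 * to the network.
 */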
1203int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1204{
1205 struct tcp_sock *tp = tcp_sk(sk);
1206 struct ubuf_info *uarg = NULL;
1207 struct sk_buff *skb;
1208 struct sockcm_cookie sockc;
1209 int flags, err, copied = 0;
1210 int mss_now = 0, size_goal, copied_syn = 0;
1211 int process_backlog = 0;
1212 bool zc = false;
1213 long timeo;
1214
1215 flags = msg->msg_flags;
1216
1217 if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
1218 skb = tcp_write_queue_tail(sk);
1219 uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
1220 if (!uarg) {
1221 err = -ENOBUFS;
1222 goto out_err;
1223 }
1224
1225 zc = sk->sk_route_caps & NETIF_F_SG;
1226 if (!zc)
1227 uarg->zerocopy = 0;
1228 }
1229
1230 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
1231 !tp->repair) {
1232 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
1233 if (err == -EINPROGRESS && copied_syn > 0)
1234 goto out;
1235 else if (err)
1236 goto out_err;
1237 }
1238
1239 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1240
1241 tcp_rate_check_app_limited(sk);
1242
	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 */
1247 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1248 !tcp_passive_fastopen(sk)) {
1249 err = sk_stream_wait_connect(sk, &timeo);
1250 if (err != 0)
1251 goto do_error;
1252 }
1253
1254 if (unlikely(tp->repair)) {
1255 if (tp->repair_queue == TCP_RECV_QUEUE) {
1256 copied = tcp_send_rcvq(sk, msg, size);
1257 goto out_nopush;
1258 }
1259
1260 err = -EINVAL;
1261 if (tp->repair_queue == TCP_NO_QUEUE)
1262 goto out_err;
1263
1264
1265 }
1266
1267 sockcm_init(&sockc, sk);
1268 if (msg->msg_controllen) {
1269 err = sock_cmsg_send(sk, msg, &sockc);
1270 if (unlikely(err)) {
1271 err = -EINVAL;
1272 goto out_err;
1273 }
1274 }
1275
1276
1277 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1278
1279
1280 copied = 0;
1281
1282restart:
1283 mss_now = tcp_send_mss(sk, &size_goal, flags);
1284
1285 err = -EPIPE;
1286 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1287 goto do_error;
1288
1289 while (msg_data_left(msg)) {
1290 int copy = 0;
1291
1292 skb = tcp_write_queue_tail(sk);
1293 if (skb)
1294 copy = size_goal - skb->len;
1295
1296 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1297 bool first_skb;
1298
1299new_segment:
1300 if (!sk_stream_memory_free(sk))
1301 goto wait_for_space;
1302
1303 if (unlikely(process_backlog >= 16)) {
1304 process_backlog = 0;
1305 if (sk_flush_backlog(sk))
1306 goto restart;
1307 }
1308 first_skb = tcp_rtx_and_write_queues_empty(sk);
1309 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
1310 first_skb);
1311 if (!skb)
1312 goto wait_for_space;
1313
1314 process_backlog++;
1315 skb->ip_summed = CHECKSUM_PARTIAL;
1316
1317 skb_entail(sk, skb);
1318 copy = size_goal;
1319
1320
1321
1322
1323
1324 if (tp->repair)
1325 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1326 }
1327
1328
1329 if (copy > msg_data_left(msg))
1330 copy = msg_data_left(msg);
1331
1332
1333 if (skb_availroom(skb) > 0 && !zc) {
1334
1335 copy = min_t(int, copy, skb_availroom(skb));
1336 err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1337 if (err)
1338 goto do_fault;
1339 } else if (!zc) {
1340 bool merge = true;
1341 int i = skb_shinfo(skb)->nr_frags;
1342 struct page_frag *pfrag = sk_page_frag(sk);
1343
1344 if (!sk_page_frag_refill(sk, pfrag))
1345 goto wait_for_space;
1346
1347 if (!skb_can_coalesce(skb, i, pfrag->page,
1348 pfrag->offset)) {
1349 if (i >= sysctl_max_skb_frags) {
1350 tcp_mark_push(tp, skb);
1351 goto new_segment;
1352 }
1353 merge = false;
1354 }
1355
1356 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1357
1358 if (!sk_wmem_schedule(sk, copy))
1359 goto wait_for_space;
1360
1361 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1362 pfrag->page,
1363 pfrag->offset,
1364 copy);
1365 if (err)
1366 goto do_error;
1367
1368
1369 if (merge) {
1370 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1371 } else {
1372 skb_fill_page_desc(skb, i, pfrag->page,
1373 pfrag->offset, copy);
1374 page_ref_inc(pfrag->page);
1375 }
1376 pfrag->offset += copy;
1377 } else {
1378 if (!sk_wmem_schedule(sk, copy))
1379 goto wait_for_space;
1380
1381 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
1382 if (err == -EMSGSIZE || err == -EEXIST) {
1383 tcp_mark_push(tp, skb);
1384 goto new_segment;
1385 }
1386 if (err < 0)
1387 goto do_error;
1388 copy = err;
1389 }
1390
1391 if (!copied)
1392 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1393
1394 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1395 TCP_SKB_CB(skb)->end_seq += copy;
1396 tcp_skb_pcount_set(skb, 0);
1397
1398 copied += copy;
1399 if (!msg_data_left(msg)) {
1400 if (unlikely(flags & MSG_EOR))
1401 TCP_SKB_CB(skb)->eor = 1;
1402 goto out;
1403 }
1404
1405 if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
1406 continue;
1407
1408 if (forced_push(tp)) {
1409 tcp_mark_push(tp, skb);
1410 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1411 } else if (skb == tcp_send_head(sk))
1412 tcp_push_one(sk, mss_now);
1413 continue;
1414
1415wait_for_space:
1416 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1417 if (copied)
1418 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1419 TCP_NAGLE_PUSH, size_goal);
1420
1421 err = sk_stream_wait_memory(sk, &timeo);
1422 if (err != 0)
1423 goto do_error;
1424
1425 mss_now = tcp_send_mss(sk, &size_goal, flags);
1426 }
1427
1428out:
1429 if (copied) {
1430 tcp_tx_timestamp(sk, sockc.tsflags);
1431 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1432 }
1433out_nopush:
1434 net_zcopy_put(uarg);
1435 return copied + copied_syn;
1436
1437do_error:
1438 skb = tcp_write_queue_tail(sk);
1439do_fault:
1440 tcp_remove_empty_skb(sk, skb);
1441
1442 if (copied + copied_syn)
1443 goto out;
1444out_err:
1445 net_zcopy_put_abort(uarg, true);
1446 err = sk_stream_error(sk, flags, err);
1447
1448 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1449 sk->sk_write_space(sk);
1450 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1451 }
1452 return err;
1453}
1454EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1455
1456int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1457{
1458 int ret;
1459
1460 lock_sock(sk);
1461 ret = tcp_sendmsg_locked(sk, msg, size);
1462 release_sock(sk);
1463
1464 return ret;
1465}
1466EXPORT_SYMBOL(tcp_sendmsg);
1467
1468
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */
1473static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1474{
1475 struct tcp_sock *tp = tcp_sk(sk);
1476
1477
1478 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1479 tp->urg_data == TCP_URG_READ)
1480 return -EINVAL;
1481
1482 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1483 return -ENOTCONN;
1484
1485 if (tp->urg_data & TCP_URG_VALID) {
1486 int err = 0;
1487 char c = tp->urg_data;
1488
1489 if (!(flags & MSG_PEEK))
1490 tp->urg_data = TCP_URG_READ;
1491
1492
1493 msg->msg_flags |= MSG_OOB;
1494
1495 if (len > 0) {
1496 if (!(flags & MSG_TRUNC))
1497 err = memcpy_to_msg(msg, &c, 1);
1498 len = 1;
1499 } else
1500 msg->msg_flags |= MSG_TRUNC;
1501
1502 return err ? -EFAULT : len;
1503 }
1504
1505 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1506 return 0;
1507
1508
1509
1510
1511
1512
1513
1514 return -EAGAIN;
1515}
1516
1517static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1518{
1519 struct sk_buff *skb;
1520 int copied = 0, err = 0;
1521
1522
1523
1524 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1525 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1526 if (err)
1527 return err;
1528 copied += skb->len;
1529 }
1530
1531 skb_queue_walk(&sk->sk_write_queue, skb) {
1532 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1533 if (err)
1534 break;
1535
1536 copied += skb->len;
1537 }
1538
1539 return err ?: copied;
1540}
1541
/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
1548void tcp_cleanup_rbuf(struct sock *sk, int copied)
1549{
1550 struct tcp_sock *tp = tcp_sk(sk);
1551 bool time_to_ack = false;
1552
1553 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1554
1555 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1556 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1557 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1558
1559 if (inet_csk_ack_scheduled(sk)) {
1560 const struct inet_connection_sock *icsk = inet_csk(sk);
1561
1562 if (
1563 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1564
1565
1566
1567
1568
1569
1570 (copied > 0 &&
1571 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1572 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1573 !inet_csk_in_pingpong_mode(sk))) &&
1574 !atomic_read(&sk->sk_rmem_alloc)))
1575 time_to_ack = true;
1576 }
1577
1578
1579
1580
1581
1582
1583
1584 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1585 __u32 rcv_window_now = tcp_receive_window(tp);
1586
1587
1588 if (2*rcv_window_now <= tp->window_clamp) {
1589 __u32 new_window = __tcp_select_window(sk);
1590
1591
1592
1593
1594
1595
1596 if (new_window && new_window >= 2 * rcv_window_now)
1597 time_to_ack = true;
1598 }
1599 }
1600 if (time_to_ack)
1601 tcp_send_ack(sk);
1602}
1603
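/* Find the skb in the receive queue that contains sequence number @seq
 * and return the offset of @seq inside it via @off.  Skbs that have
 * been fully consumed already are freed on the way.
 */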
1604static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1605{
1606 struct sk_buff *skb;
1607 u32 offset;
1608
1609 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1610 offset = seq - TCP_SKB_CB(skb)->seq;
1611 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1612 pr_err_once("%s: found a SYN, please report !\n", __func__);
1613 offset--;
1614 }
1615 if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1616 *off = offset;
1617 return skb;
1618 }
1619
1620
1621
1622
1623 sk_eat_skb(sk, skb);
1624 }
1625 return NULL;
1626}
1627
/*
 *	This routine provides an alternative to tcp_recvmsg() for routines
 *	that would like to handle copying from skbuffs directly in 'sendfile'
 *	fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */
1639int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1640 sk_read_actor_t recv_actor)
1641{
1642 struct sk_buff *skb;
1643 struct tcp_sock *tp = tcp_sk(sk);
1644 u32 seq = tp->copied_seq;
1645 u32 offset;
1646 int copied = 0;
1647
1648 if (sk->sk_state == TCP_LISTEN)
1649 return -ENOTCONN;
1650 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1651 if (offset < skb->len) {
1652 int used;
1653 size_t len;
1654
1655 len = skb->len - offset;
1656
1657 if (tp->urg_data) {
1658 u32 urg_offset = tp->urg_seq - seq;
1659 if (urg_offset < len)
1660 len = urg_offset;
1661 if (!len)
1662 break;
1663 }
1664 used = recv_actor(desc, skb, offset, len);
1665 if (used <= 0) {
1666 if (!copied)
1667 copied = used;
1668 break;
1669 } else if (used <= len) {
1670 seq += used;
1671 copied += used;
1672 offset += used;
1673 }
1674
1675
1676
1677
1678
1679 skb = tcp_recv_skb(sk, seq - 1, &offset);
1680 if (!skb)
1681 break;
1682
1683
1684
1685 if (offset + 1 != skb->len)
1686 continue;
1687 }
1688 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1689 sk_eat_skb(sk, skb);
1690 ++seq;
1691 break;
1692 }
1693 sk_eat_skb(sk, skb);
1694 if (!desc->count)
1695 break;
1696 WRITE_ONCE(tp->copied_seq, seq);
1697 }
1698 WRITE_ONCE(tp->copied_seq, seq);
1699
1700 tcp_rcv_space_adjust(sk);
1701
1702
1703 if (copied > 0) {
1704 tcp_recv_skb(sk, seq, &offset);
1705 tcp_cleanup_rbuf(sk, copied);
1706 }
1707 return copied;
1708}
1709EXPORT_SYMBOL(tcp_read_sock);
1710
1711int tcp_peek_len(struct socket *sock)
1712{
1713 return tcp_inq(sock->sk);
1714}
1715EXPORT_SYMBOL(tcp_peek_len);
1716
/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
1718int tcp_set_rcvlowat(struct sock *sk, int val)
1719{
1720 int cap;
1721
1722 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1723 cap = sk->sk_rcvbuf >> 1;
1724 else
1725 cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
1726 val = min(val, cap);
1727 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1728
1729
1730 tcp_data_ready(sk);
1731
1732 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1733 return 0;
1734
1735 val <<= 1;
1736 if (val > sk->sk_rcvbuf) {
1737 WRITE_ONCE(sk->sk_rcvbuf, val);
1738 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1739 }
1740 return 0;
1741}
1742EXPORT_SYMBOL(tcp_set_rcvlowat);
1743
1744void tcp_update_recv_tstamps(struct sk_buff *skb,
1745 struct scm_timestamping_internal *tss)
1746{
1747 if (skb->tstamp)
1748 tss->ts[0] = ktime_to_timespec64(skb->tstamp);
1749 else
1750 tss->ts[0] = (struct timespec64) {0};
1751
1752 if (skb_hwtstamps(skb)->hwtstamp)
1753 tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
1754 else
1755 tss->ts[2] = (struct timespec64) {0};
1756}
1757
1758#ifdef CONFIG_MMU
1759static const struct vm_operations_struct tcp_vm_ops = {
1760};
1761
1762int tcp_mmap(struct file *file, struct socket *sock,
1763 struct vm_area_struct *vma)
1764{
1765 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1766 return -EPERM;
1767 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1768
1769
1770 vma->vm_flags |= VM_MIXEDMAP;
1771
1772 vma->vm_ops = &tcp_vm_ops;
1773 return 0;
1774}
1775EXPORT_SYMBOL(tcp_mmap);
1776
1777static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
1778 u32 *offset_frag)
1779{
1780 skb_frag_t *frag;
1781
1782 offset_skb -= skb_headlen(skb);
1783 if ((int)offset_skb < 0 || skb_has_frag_list(skb))
1784 return NULL;
1785
1786 frag = skb_shinfo(skb)->frags;
1787 while (offset_skb) {
1788 if (skb_frag_size(frag) > offset_skb) {
1789 *offset_frag = offset_skb;
1790 return frag;
1791 }
1792 offset_skb -= skb_frag_size(frag);
1793 ++frag;
1794 }
1795 *offset_frag = 0;
1796 return frag;
1797}
1798
1799static bool can_map_frag(const skb_frag_t *frag)
1800{
1801 return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
1802}
1803
1804static int find_next_mappable_frag(const skb_frag_t *frag,
1805 int remaining_in_skb)
1806{
1807 int offset = 0;
1808
1809 if (likely(can_map_frag(frag)))
1810 return 0;
1811
1812 while (offset < remaining_in_skb && !can_map_frag(frag)) {
1813 offset += skb_frag_size(frag);
1814 ++frag;
1815 }
1816 return offset;
1817}
1818
1819static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
1820 struct tcp_zerocopy_receive *zc,
1821 struct sk_buff *skb, u32 offset)
1822{
1823 u32 frag_offset, partial_frag_remainder = 0;
1824 int mappable_offset;
1825 skb_frag_t *frag;
1826
1827
1828 zc->recv_skip_hint = skb->len - offset;
1829
1830
1831 frag = skb_advance_to_frag(skb, offset, &frag_offset);
1832 if (!frag)
1833 return;
1834
1835 if (frag_offset) {
1836 struct skb_shared_info *info = skb_shinfo(skb);
1837
1838
1839 if (frag == &info->frags[info->nr_frags - 1])
1840 return;
1841
1842
1843 partial_frag_remainder = skb_frag_size(frag) - frag_offset;
1844 zc->recv_skip_hint -= partial_frag_remainder;
1845 ++frag;
1846 }
1847
1848
1849
1850
1851
1852 mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
1853 zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
1854}
1855
1856static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
1857 int nonblock, int flags,
1858 struct scm_timestamping_internal *tss,
1859 int *cmsg_flags);
1860static int receive_fallback_to_copy(struct sock *sk,
1861 struct tcp_zerocopy_receive *zc, int inq,
1862 struct scm_timestamping_internal *tss)
1863{
1864 unsigned long copy_address = (unsigned long)zc->copybuf_address;
1865 struct msghdr msg = {};
1866 struct iovec iov;
1867 int err;
1868
1869 zc->length = 0;
1870 zc->recv_skip_hint = 0;
1871
1872 if (copy_address != zc->copybuf_address)
1873 return -EINVAL;
1874
1875 err = import_single_range(READ, (void __user *)copy_address,
1876 inq, &iov, &msg.msg_iter);
1877 if (err)
1878 return err;
1879
1880 err = tcp_recvmsg_locked(sk, &msg, inq, 1, 0,
1881 tss, &zc->msg_flags);
1882 if (err < 0)
1883 return err;
1884
1885 zc->copybuf_len = err;
1886 if (likely(zc->copybuf_len)) {
1887 struct sk_buff *skb;
1888 u32 offset;
1889
1890 skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
1891 if (skb)
1892 tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
1893 }
1894 return 0;
1895}
1896
1897static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
1898 struct sk_buff *skb, u32 copylen,
1899 u32 *offset, u32 *seq)
1900{
1901 unsigned long copy_address = (unsigned long)zc->copybuf_address;
1902 struct msghdr msg = {};
1903 struct iovec iov;
1904 int err;
1905
1906 if (copy_address != zc->copybuf_address)
1907 return -EINVAL;
1908
1909 err = import_single_range(READ, (void __user *)copy_address,
1910 copylen, &iov, &msg.msg_iter);
1911 if (err)
1912 return err;
1913 err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
1914 if (err)
1915 return err;
1916 zc->recv_skip_hint -= copylen;
1917 *offset += copylen;
1918 *seq += copylen;
1919 return (__s32)copylen;
1920}
1921
1922static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
1923 struct sock *sk,
1924 struct sk_buff *skb,
1925 u32 *seq,
1926 s32 copybuf_len,
1927 struct scm_timestamping_internal *tss)
1928{
1929 u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
1930
1931 if (!copylen)
1932 return 0;
1933
1934 if (skb) {
1935 offset = *seq - TCP_SKB_CB(skb)->seq;
1936 } else {
1937 skb = tcp_recv_skb(sk, *seq, &offset);
1938 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1939 tcp_update_recv_tstamps(skb, tss);
1940 zc->msg_flags |= TCP_CMSG_TS;
1941 }
1942 }
1943
1944 zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
1945 seq);
1946 return zc->copybuf_len < 0 ? 0 : copylen;
1947}
1948
1949static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
1950 struct page **pending_pages,
1951 unsigned long pages_remaining,
1952 unsigned long *address,
1953 u32 *length,
1954 u32 *seq,
1955 struct tcp_zerocopy_receive *zc,
1956 u32 total_bytes_to_map,
1957 int err)
1958{
1959
1960 if (err == -EBUSY &&
1961 zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
1962 u32 maybe_zap_len;
1963
1964 maybe_zap_len = total_bytes_to_map -
1965 *length +
1966 (pages_remaining * PAGE_SIZE);
1967 zap_page_range(vma, *address, maybe_zap_len);
1968 err = 0;
1969 }
1970
1971 if (!err) {
1972 unsigned long leftover_pages = pages_remaining;
1973 int bytes_mapped;
1974
1975
1976 err = vm_insert_pages(vma, *address,
1977 pending_pages,
1978 &pages_remaining);
1979 bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
1980 *seq += bytes_mapped;
1981 *address += bytes_mapped;
1982 }
1983 if (err) {
1984
1985
1986
1987
1988
1989 const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
1990
1991 *length -= bytes_not_mapped;
1992 zc->recv_skip_hint += bytes_not_mapped;
1993 }
1994 return err;
1995}
1996
1997static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
1998 struct page **pages,
1999 unsigned int pages_to_map,
2000 unsigned long *address,
2001 u32 *length,
2002 u32 *seq,
2003 struct tcp_zerocopy_receive *zc,
2004 u32 total_bytes_to_map)
2005{
2006 unsigned long pages_remaining = pages_to_map;
2007 unsigned int pages_mapped;
2008 unsigned int bytes_mapped;
2009 int err;
2010
2011 err = vm_insert_pages(vma, *address, pages, &pages_remaining);
2012 pages_mapped = pages_to_map - (unsigned int)pages_remaining;
2013 bytes_mapped = PAGE_SIZE * pages_mapped;
2014
2015
2016
2017 *seq += bytes_mapped;
2018 *address += bytes_mapped;
2019
2020 if (likely(!err))
2021 return 0;
2022
2023
2024 return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
2025 pages_remaining, address, length, seq, zc, total_bytes_to_map,
2026 err);
2027}
2028
2029#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS)
2030static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
2031 struct tcp_zerocopy_receive *zc,
2032 struct scm_timestamping_internal *tss)
2033{
2034 unsigned long msg_control_addr;
2035 struct msghdr cmsg_dummy;
2036
2037 msg_control_addr = (unsigned long)zc->msg_control;
2038 cmsg_dummy.msg_control = (void *)msg_control_addr;
2039 cmsg_dummy.msg_controllen =
2040 (__kernel_size_t)zc->msg_controllen;
2041 cmsg_dummy.msg_flags = in_compat_syscall()
2042 ? MSG_CMSG_COMPAT : 0;
2043 cmsg_dummy.msg_control_is_user = true;
2044 zc->msg_flags = 0;
2045 if (zc->msg_control == msg_control_addr &&
2046 zc->msg_controllen == cmsg_dummy.msg_controllen) {
2047 tcp_recv_timestamp(&cmsg_dummy, sk, tss);
2048 zc->msg_control = (__u64)
2049 ((uintptr_t)cmsg_dummy.msg_control);
2050 zc->msg_controllen =
2051 (__u64)cmsg_dummy.msg_controllen;
2052 zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
2053 }
2054}
2055
2056#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
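/* Receive zerocopy: map whole pages of payload from the receive queue
 * directly into the caller-supplied VMA, in batches of
 * TCP_ZEROCOPY_PAGE_BATCH_SIZE pages, and fall back to copying any
 * leftover bytes that are not page-aligned into the user's copybuf.
 */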
2057static int tcp_zerocopy_receive(struct sock *sk,
2058 struct tcp_zerocopy_receive *zc,
2059 struct scm_timestamping_internal *tss)
2060{
2061 u32 length = 0, offset, vma_len, avail_len, copylen = 0;
2062 unsigned long address = (unsigned long)zc->address;
2063 struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
2064 s32 copybuf_len = zc->copybuf_len;
2065 struct tcp_sock *tp = tcp_sk(sk);
2066 const skb_frag_t *frags = NULL;
2067 unsigned int pages_to_map = 0;
2068 struct vm_area_struct *vma;
2069 struct sk_buff *skb = NULL;
2070 u32 seq = tp->copied_seq;
2071 u32 total_bytes_to_map;
2072 int inq = tcp_inq(sk);
2073 int ret;
2074
2075 zc->copybuf_len = 0;
2076 zc->msg_flags = 0;
2077
2078 if (address & (PAGE_SIZE - 1) || address != zc->address)
2079 return -EINVAL;
2080
2081 if (sk->sk_state == TCP_LISTEN)
2082 return -ENOTCONN;
2083
2084 sock_rps_record_flow(sk);
2085
2086 if (inq && inq <= copybuf_len)
2087 return receive_fallback_to_copy(sk, zc, inq, tss);
2088
2089 if (inq < PAGE_SIZE) {
2090 zc->length = 0;
2091 zc->recv_skip_hint = inq;
2092 if (!inq && sock_flag(sk, SOCK_DONE))
2093 return -EIO;
2094 return 0;
2095 }
2096
2097 mmap_read_lock(current->mm);
2098
2099 vma = vma_lookup(current->mm, address);
2100 if (!vma || vma->vm_ops != &tcp_vm_ops) {
2101 mmap_read_unlock(current->mm);
2102 return -EINVAL;
2103 }
2104 vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
2105 avail_len = min_t(u32, vma_len, inq);
2106 total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
2107 if (total_bytes_to_map) {
2108 if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
2109 zap_page_range(vma, address, total_bytes_to_map);
2110 zc->length = total_bytes_to_map;
2111 zc->recv_skip_hint = 0;
2112 } else {
2113 zc->length = avail_len;
2114 zc->recv_skip_hint = avail_len;
2115 }
2116 ret = 0;
2117 while (length + PAGE_SIZE <= zc->length) {
2118 int mappable_offset;
2119 struct page *page;
2120
2121 if (zc->recv_skip_hint < PAGE_SIZE) {
2122 u32 offset_frag;
2123
2124 if (skb) {
2125 if (zc->recv_skip_hint > 0)
2126 break;
2127 skb = skb->next;
2128 offset = seq - TCP_SKB_CB(skb)->seq;
2129 } else {
2130 skb = tcp_recv_skb(sk, seq, &offset);
2131 }
2132
2133 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2134 tcp_update_recv_tstamps(skb, tss);
2135 zc->msg_flags |= TCP_CMSG_TS;
2136 }
2137 zc->recv_skip_hint = skb->len - offset;
2138 frags = skb_advance_to_frag(skb, offset, &offset_frag);
2139 if (!frags || offset_frag)
2140 break;
2141 }
2142
2143 mappable_offset = find_next_mappable_frag(frags,
2144 zc->recv_skip_hint);
2145 if (mappable_offset) {
2146 zc->recv_skip_hint = mappable_offset;
2147 break;
2148 }
2149 page = skb_frag_page(frags);
2150 prefetchw(page);
2151 pages[pages_to_map++] = page;
2152 length += PAGE_SIZE;
2153 zc->recv_skip_hint -= PAGE_SIZE;
2154 frags++;
2155 if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
2156 zc->recv_skip_hint < PAGE_SIZE) {
2157
2158
2159
2160 ret = tcp_zerocopy_vm_insert_batch(vma, pages,
2161 pages_to_map,
2162 &address, &length,
2163 &seq, zc,
2164 total_bytes_to_map);
2165 if (ret)
2166 goto out;
2167 pages_to_map = 0;
2168 }
2169 }
2170 if (pages_to_map) {
2171 ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
2172 &address, &length, &seq,
2173 zc, total_bytes_to_map);
2174 }
2175out:
2176 mmap_read_unlock(current->mm);
2177
2178 if (!ret)
2179 copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
2180
2181 if (length + copylen) {
2182 WRITE_ONCE(tp->copied_seq, seq);
2183 tcp_rcv_space_adjust(sk);
2184
2185
2186 tcp_recv_skb(sk, seq, &offset);
2187 tcp_cleanup_rbuf(sk, length + copylen);
2188 ret = 0;
2189 if (length == zc->length)
2190 zc->recv_skip_hint = 0;
2191 } else {
2192 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
2193 ret = -EIO;
2194 }
2195 zc->length = length;
2196 return ret;
2197}
2198#endif
2199
/* Similar to __sock_recv_timestamp, but does not require an skb */
2201void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
2202 struct scm_timestamping_internal *tss)
2203{
2204 int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
2205 bool has_timestamping = false;
2206
2207 if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
2208 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
2209 if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
2210 if (new_tstamp) {
2211 struct __kernel_timespec kts = {
2212 .tv_sec = tss->ts[0].tv_sec,
2213 .tv_nsec = tss->ts[0].tv_nsec,
2214 };
2215 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2216 sizeof(kts), &kts);
2217 } else {
2218 struct __kernel_old_timespec ts_old = {
2219 .tv_sec = tss->ts[0].tv_sec,
2220 .tv_nsec = tss->ts[0].tv_nsec,
2221 };
2222 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2223 sizeof(ts_old), &ts_old);
2224 }
2225 } else {
2226 if (new_tstamp) {
2227 struct __kernel_sock_timeval stv = {
2228 .tv_sec = tss->ts[0].tv_sec,
2229 .tv_usec = tss->ts[0].tv_nsec / 1000,
2230 };
2231 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2232 sizeof(stv), &stv);
2233 } else {
2234 struct __kernel_old_timeval tv = {
2235 .tv_sec = tss->ts[0].tv_sec,
2236 .tv_usec = tss->ts[0].tv_nsec / 1000,
2237 };
2238 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2239 sizeof(tv), &tv);
2240 }
2241 }
2242 }
2243
2244 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
2245 has_timestamping = true;
2246 else
2247 tss->ts[0] = (struct timespec64) {0};
2248 }
2249
2250 if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
2251 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
2252 has_timestamping = true;
2253 else
2254 tss->ts[2] = (struct timespec64) {0};
2255 }
2256
2257 if (has_timestamping) {
2258 tss->ts[1] = (struct timespec64) {0};
2259 if (sock_flag(sk, SOCK_TSTAMP_NEW))
2260 put_cmsg_scm_timestamping64(msg, tss);
2261 else
2262 put_cmsg_scm_timestamping(msg, tss);
2263 }
2264}
2265
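/* Best-effort count of bytes available to read, computed locklessly;
 * takes the socket lock only if the lockless snapshot looks
 * inconsistent.  Returns 1 once the connection is done (FIN received)
 * so the application keeps reading until it sees EOF.
 */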
2266static int tcp_inq_hint(struct sock *sk)
2267{
2268 const struct tcp_sock *tp = tcp_sk(sk);
2269 u32 copied_seq = READ_ONCE(tp->copied_seq);
2270 u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
2271 int inq;
2272
2273 inq = rcv_nxt - copied_seq;
2274 if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
2275 lock_sock(sk);
2276 inq = tp->rcv_nxt - tp->copied_seq;
2277 release_sock(sk);
2278 }
2279
2280
2281
2282 if (inq == 0 && sock_flag(sk, SOCK_DONE))
2283 inq = 1;
2284 return inq;
2285}
2286
/*
 *	This routine copies from a sock struct into the user buffer.
 *	The socket must be locked by the caller; tcp_recvmsg() is the
 *	front end that takes and releases the lock around it.
 */
2295static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
2296 int nonblock, int flags,
2297 struct scm_timestamping_internal *tss,
2298 int *cmsg_flags)
2299{
2300 struct tcp_sock *tp = tcp_sk(sk);
2301 int copied = 0;
2302 u32 peek_seq;
2303 u32 *seq;
2304 unsigned long used;
2305 int err;
2306 int target;
2307 long timeo;
2308 struct sk_buff *skb, *last;
2309 u32 urg_hole = 0;
2310
2311 err = -ENOTCONN;
2312 if (sk->sk_state == TCP_LISTEN)
2313 goto out;
2314
2315 if (tp->recvmsg_inq)
2316 *cmsg_flags = TCP_CMSG_INQ;
2317 timeo = sock_rcvtimeo(sk, nonblock);
2318
2319
2320 if (flags & MSG_OOB)
2321 goto recv_urg;
2322
2323 if (unlikely(tp->repair)) {
2324 err = -EPERM;
2325 if (!(flags & MSG_PEEK))
2326 goto out;
2327
2328 if (tp->repair_queue == TCP_SEND_QUEUE)
2329 goto recv_sndq;
2330
2331 err = -EINVAL;
2332 if (tp->repair_queue == TCP_NO_QUEUE)
2333 goto out;
2334
2335
2336 }
2337
2338 seq = &tp->copied_seq;
2339 if (flags & MSG_PEEK) {
2340 peek_seq = tp->copied_seq;
2341 seq = &peek_seq;
2342 }
2343
2344 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
2345
2346 do {
2347 u32 offset;
2348
2349
2350 if (tp->urg_data && tp->urg_seq == *seq) {
2351 if (copied)
2352 break;
2353 if (signal_pending(current)) {
2354 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2355 break;
2356 }
2357 }
2358
2359
2360
2361 last = skb_peek_tail(&sk->sk_receive_queue);
2362 skb_queue_walk(&sk->sk_receive_queue, skb) {
2363 last = skb;
2364
2365
2366
2367 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2368 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2369 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2370 flags))
2371 break;
2372
2373 offset = *seq - TCP_SKB_CB(skb)->seq;
2374 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2375 pr_err_once("%s: found a SYN, please report !\n", __func__);
2376 offset--;
2377 }
2378 if (offset < skb->len)
2379 goto found_ok_skb;
2380 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2381 goto found_fin_ok;
2382 WARN(!(flags & MSG_PEEK),
2383 "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2384 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2385 }
2386
2387
2388
2389 if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
2390 break;
2391
2392 if (copied) {
2393 if (sk->sk_err ||
2394 sk->sk_state == TCP_CLOSE ||
2395 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2396 !timeo ||
2397 signal_pending(current))
2398 break;
2399 } else {
2400 if (sock_flag(sk, SOCK_DONE))
2401 break;
2402
2403 if (sk->sk_err) {
2404 copied = sock_error(sk);
2405 break;
2406 }
2407
2408 if (sk->sk_shutdown & RCV_SHUTDOWN)
2409 break;
2410
2411 if (sk->sk_state == TCP_CLOSE) {
2412
2413
2414
2415 copied = -ENOTCONN;
2416 break;
2417 }
2418
2419 if (!timeo) {
2420 copied = -EAGAIN;
2421 break;
2422 }
2423
2424 if (signal_pending(current)) {
2425 copied = sock_intr_errno(timeo);
2426 break;
2427 }
2428 }
2429
2430 tcp_cleanup_rbuf(sk, copied);
2431
2432 if (copied >= target) {
2433
2434 release_sock(sk);
2435 lock_sock(sk);
2436 } else {
2437 sk_wait_data(sk, &timeo, last);
2438 }
2439
2440 if ((flags & MSG_PEEK) &&
2441 (peek_seq - copied - urg_hole != tp->copied_seq)) {
2442 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2443 current->comm,
2444 task_pid_nr(current));
2445 peek_seq = tp->copied_seq;
2446 }
2447 continue;
2448
2449found_ok_skb:
2450
2451 used = skb->len - offset;
2452 if (len < used)
2453 used = len;
2454
2455
2456 if (tp->urg_data) {
2457 u32 urg_offset = tp->urg_seq - *seq;
2458 if (urg_offset < used) {
2459 if (!urg_offset) {
2460 if (!sock_flag(sk, SOCK_URGINLINE)) {
2461 WRITE_ONCE(*seq, *seq + 1);
2462 urg_hole++;
2463 offset++;
2464 used--;
2465 if (!used)
2466 goto skip_copy;
2467 }
2468 } else
2469 used = urg_offset;
2470 }
2471 }
2472
2473 if (!(flags & MSG_TRUNC)) {
2474 err = skb_copy_datagram_msg(skb, offset, msg, used);
2475 if (err) {
2476
2477 if (!copied)
2478 copied = -EFAULT;
2479 break;
2480 }
2481 }
2482
2483 WRITE_ONCE(*seq, *seq + used);
2484 copied += used;
2485 len -= used;
2486
2487 tcp_rcv_space_adjust(sk);
2488
2489skip_copy:
2490 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
2491 tp->urg_data = 0;
2492 tcp_fast_path_check(sk);
2493 }
2494
2495 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2496 tcp_update_recv_tstamps(skb, tss);
2497 *cmsg_flags |= TCP_CMSG_TS;
2498 }
2499
2500 if (used + offset < skb->len)
2501 continue;
2502
2503 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2504 goto found_fin_ok;
2505 if (!(flags & MSG_PEEK))
2506 sk_eat_skb(sk, skb);
2507 continue;
2508
2509found_fin_ok:
2510
2511 WRITE_ONCE(*seq, *seq + 1);
2512 if (!(flags & MSG_PEEK))
2513 sk_eat_skb(sk, skb);
2514 break;
2515 } while (len > 0);
2516
2517
2518
2519
2520
2521
2522 tcp_cleanup_rbuf(sk, copied);
2523 return copied;
2524
2525out:
2526 return err;
2527
2528recv_urg:
2529 err = tcp_recv_urg(sk, msg, len, flags);
2530 goto out;
2531
2532recv_sndq:
2533 err = tcp_peek_sndq(sk, msg, len);
2534 goto out;
2535}
2536
2537int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
2538 int flags, int *addr_len)
2539{
2540 int cmsg_flags = 0, ret, inq;
2541 struct scm_timestamping_internal tss;
2542
2543 if (unlikely(flags & MSG_ERRQUEUE))
2544 return inet_recv_error(sk, msg, len, addr_len);
2545
2546 if (sk_can_busy_loop(sk) &&
2547 skb_queue_empty_lockless(&sk->sk_receive_queue) &&
2548 sk->sk_state == TCP_ESTABLISHED)
2549 sk_busy_loop(sk, nonblock);
2550
2551 lock_sock(sk);
2552 ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
2553 &cmsg_flags);
2554 release_sock(sk);
2555
2556 if (cmsg_flags && ret >= 0) {
2557 if (cmsg_flags & TCP_CMSG_TS)
2558 tcp_recv_timestamp(msg, sk, &tss);
2559 if (cmsg_flags & TCP_CMSG_INQ) {
2560 inq = tcp_inq_hint(sk);
2561 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2562 }
2563 }
2564 return ret;
2565}
2566EXPORT_SYMBOL(tcp_recvmsg);
2567
2568void tcp_set_state(struct sock *sk, int state)
2569{
2570 int oldstate = sk->sk_state;
2571
	/* We defined a new enum for TCP states that are exported in BPF
	 * so as not force the internal TCP states to be frozen. The
	 * following checks will detect if an internal state value ever
	 * differ from the BPF value. If this ever happens, then we will
	 * need to remap the internal value to the BPF value before calling
	 * tcp_call_bpf_2arg.
	 */
2579 BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2580 BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2581 BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2582 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2583 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2584 BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2585 BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2586 BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2587 BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2588 BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2589 BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2590 BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2591 BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602 BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
2603
2604 if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2605 tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2606
2607 switch (state) {
2608 case TCP_ESTABLISHED:
2609 if (oldstate != TCP_ESTABLISHED)
2610 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2611 break;
2612
2613 case TCP_CLOSE:
2614 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2615 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2616
2617 sk->sk_prot->unhash(sk);
2618 if (inet_csk(sk)->icsk_bind_hash &&
2619 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2620 inet_put_port(sk);
2621 fallthrough;
2622 default:
2623 if (oldstate == TCP_ESTABLISHED)
2624 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2625 }
2626
2627
2628
2629
2630 inet_sk_state_store(sk, state);
2631}
2632EXPORT_SYMBOL_GPL(tcp_set_state);
2633
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	in CLOSING.
 */
2641static const unsigned char new_state[16] = {
  /* current state:         new state:       action: */
  [0 /* (Invalid) */] = TCP_CLOSE,
2644 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2645 [TCP_SYN_SENT] = TCP_CLOSE,
2646 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2647 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2648 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2649 [TCP_TIME_WAIT] = TCP_CLOSE,
2650 [TCP_CLOSE] = TCP_CLOSE,
2651 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2652 [TCP_LAST_ACK] = TCP_LAST_ACK,
2653 [TCP_LISTEN] = TCP_CLOSE,
2654 [TCP_CLOSING] = TCP_CLOSING,
2655 [TCP_NEW_SYN_RECV] = TCP_CLOSE,
2656};
2657
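/* Shift the socket towards close according to the table above; returns
 * TCP_ACTION_FIN if a FIN still needs to be sent for the new state.
 */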
2658static int tcp_close_state(struct sock *sk)
2659{
2660 int next = (int)new_state[sk->sk_state];
2661 int ns = next & TCP_STATE_MASK;
2662
2663 tcp_set_state(sk, ns);
2664
2665 return next & TCP_ACTION_FIN;
2666}
2667
/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
2673void tcp_shutdown(struct sock *sk, int how)
2674{
	/* We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */
2679 if (!(how & SEND_SHUTDOWN))
2680 return;
2681
	/* If we've already sent a FIN, or it's a closed state, skip this. */
2683 if ((1 << sk->sk_state) &
2684 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2685 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
2687 if (tcp_close_state(sk))
2688 tcp_send_fin(sk);
2689 }
2690}
2691EXPORT_SYMBOL(tcp_shutdown);
2692
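/* Decide whether an orphaned socket must be aborted: either there are too
 * many orphans or TCP has run out of socket memory.
 */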
2693bool tcp_check_oom(struct sock *sk, int shift)
2694{
2695 bool too_many_orphans, out_of_socket_memory;
2696
2697 too_many_orphans = tcp_too_many_orphans(sk, shift);
2698 out_of_socket_memory = tcp_out_of_memory(sk);
2699
2700 if (too_many_orphans)
2701 net_info_ratelimited("too many orphaned sockets\n");
2702 if (out_of_socket_memory)
2703 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2704 return too_many_orphans || out_of_socket_memory;
2705}
2706
2707void __tcp_close(struct sock *sk, long timeout)
2708{
2709 struct sk_buff *skb;
2710 int data_was_unread = 0;
2711 int state;
2712
2713 sk->sk_shutdown = SHUTDOWN_MASK;
2714
2715 if (sk->sk_state == TCP_LISTEN) {
2716 tcp_set_state(sk, TCP_CLOSE);
2717
		/* Special case. */
2719 inet_csk_listen_stop(sk);
2720
2721 goto adjudge_to_death;
2722 }
2723
	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
2728 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2729 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2730
2731 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2732 len--;
2733 data_was_unread += len;
2734 __kfree_skb(skb);
2735 }
2736
2737 sk_mem_reclaim(sk);
2738
	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
2740 if (sk->sk_state == TCP_CLOSE)
2741 goto adjudge_to_death;
2742
	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost. Doing a FIN in this situation would leave the peer
	 * believing all data was delivered.
	 * Note: timeout is always zero in such a case.
	 */
2750 if (unlikely(tcp_sk(sk)->repair)) {
2751 sk->sk_prot->disconnect(sk, 0);
2752 } else if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
2754 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2755 tcp_set_state(sk, TCP_CLOSE);
2756 tcp_send_active_reset(sk, sk->sk_allocation);
2757 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
2759 sk->sk_prot->disconnect(sk, 0);
2760 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2761 } else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 *
		 * Strictly speaking this bends the TCP state machine: the
		 * ESTABLISHED -> FIN_WAIT1 and CLOSE_WAIT -> LAST_ACK
		 * transitions are only legal once the FIN is actually within
		 * the send window, not merely queued. The visible effect is
		 * that we sometimes enter time-wait when it is not strictly
		 * required, which is harmless.
		 */
2791 tcp_send_fin(sk);
2792 }
2793
2794 sk_stream_wait_close(sk, timeout);
2795
2796adjudge_to_death:
2797 state = sk->sk_state;
2798 sock_hold(sk);
2799 sock_orphan(sk);
2800
2801 local_bh_disable();
2802 bh_lock_sock(sk);
	/* remove backlog if any, without releasing ownership. */
2804 __release_sock(sk);
2805
2806 percpu_counter_inc(sk->sk_prot->orphan_count);
2807
	/* Have we already been destroyed by a softirq or backlog? */
2809 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2810 goto out;
2811
	/*	This is a (useful) BSD violation of the RFC: the other end
	 *	could otherwise keep a socket open forever in FIN_WAIT2 with
	 *	no application left on our side. We use roughly a one minute
	 *	timeout (about the same as BSD) and then kill our end. This
	 *	is desirable e.g. for http servers, where such sockets are
	 *	useless but cannot be reaped by a timeout once the
	 *	application has been halted manually.
	 */
2826 if (sk->sk_state == TCP_FIN_WAIT2) {
2827 struct tcp_sock *tp = tcp_sk(sk);
2828 if (tp->linger2 < 0) {
2829 tcp_set_state(sk, TCP_CLOSE);
2830 tcp_send_active_reset(sk, GFP_ATOMIC);
2831 __NET_INC_STATS(sock_net(sk),
2832 LINUX_MIB_TCPABORTONLINGER);
2833 } else {
2834 const int tmo = tcp_fin_time(sk);
2835
2836 if (tmo > TCP_TIMEWAIT_LEN) {
2837 inet_csk_reset_keepalive_timer(sk,
2838 tmo - TCP_TIMEWAIT_LEN);
2839 } else {
2840 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2841 goto out;
2842 }
2843 }
2844 }
2845 if (sk->sk_state != TCP_CLOSE) {
2846 sk_mem_reclaim(sk);
2847 if (tcp_check_oom(sk, 0)) {
2848 tcp_set_state(sk, TCP_CLOSE);
2849 tcp_send_active_reset(sk, GFP_ATOMIC);
2850 __NET_INC_STATS(sock_net(sk),
2851 LINUX_MIB_TCPABORTONMEMORY);
2852 } else if (!check_net(sock_net(sk))) {
			/* Not possible to send reset; just close */
2854 tcp_set_state(sk, TCP_CLOSE);
2855 }
2856 }
2857
2858 if (sk->sk_state == TCP_CLOSE) {
2859 struct request_sock *req;
2860
2861 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2862 lockdep_sock_is_held(sk));
		/* We could get here with a non-NULL req if the socket is
		 * aborted (e.g., closed with unread data) before 3WHS
		 * finishes.
		 */
2867 if (req)
2868 reqsk_fastopen_remove(sk, req, false);
2869 inet_csk_destroy_sock(sk);
2870 }
	/* Otherwise, socket is reprieved until protocol close. */

2873out:
2874 bh_unlock_sock(sk);
2875 local_bh_enable();
2876}
2877
2878void tcp_close(struct sock *sk, long timeout)
2879{
2880 lock_sock(sk);
2881 __tcp_close(sk, timeout);
2882 release_sock(sk);
2883 sock_put(sk);
2884}
2885EXPORT_SYMBOL(tcp_close);
2886
/* These states need RST on ABORT according to RFC793 */
2889static inline bool tcp_need_reset(int state)
2890{
2891 return (1 << state) &
2892 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2893 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2894}
2895
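/* Free every skb in the retransmit rb-tree and clear the SACK hint. */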
2896static void tcp_rtx_queue_purge(struct sock *sk)
2897{
2898 struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2899
2900 tcp_sk(sk)->highest_sack = NULL;
2901 while (p) {
2902 struct sk_buff *skb = rb_to_skb(p);
2903
2904 p = rb_next(p);
2905
		/* Since we are deleting whole queue, no need to
		 * list_del(&skb->tcp_tsorted_anchor)
		 */
2908 tcp_rtx_queue_unlink(skb, sk);
2909 sk_wmem_free_skb(sk, skb);
2910 }
2911}
2912
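/* Purge the write queue, the retransmit queue and any cached tx skb,
 * then reset retransmission bookkeeping.
 */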
2913void tcp_write_queue_purge(struct sock *sk)
2914{
2915 struct sk_buff *skb;
2916
2917 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
2918 while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
2919 tcp_skb_tsorted_anchor_cleanup(skb);
2920 sk_wmem_free_skb(sk, skb);
2921 }
2922 tcp_rtx_queue_purge(sk);
2923 skb = sk->sk_tx_skb_cache;
2924 if (skb) {
2925 __kfree_skb(skb);
2926 sk->sk_tx_skb_cache = NULL;
2927 }
2928 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2929 sk_mem_reclaim(sk);
2930 tcp_clear_all_retrans_hints(tcp_sk(sk));
2931 tcp_sk(sk)->packets_out = 0;
2932 inet_csk(sk)->icsk_backoff = 0;
2933}
2934
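/* Abort the connection and reset the socket to TCP_CLOSE so it can be
 * reused, e.g. by connect(AF_UNSPEC) or TCP repair.
 */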
2935int tcp_disconnect(struct sock *sk, int flags)
2936{
2937 struct inet_sock *inet = inet_sk(sk);
2938 struct inet_connection_sock *icsk = inet_csk(sk);
2939 struct tcp_sock *tp = tcp_sk(sk);
2940 int old_state = sk->sk_state;
2941 u32 seq;
2942
2943 if (old_state != TCP_CLOSE)
2944 tcp_set_state(sk, TCP_CLOSE);
2945
	/* ABORT function of RFC793 */
2947 if (old_state == TCP_LISTEN) {
2948 inet_csk_listen_stop(sk);
2949 } else if (unlikely(tp->repair)) {
2950 sk->sk_err = ECONNABORTED;
2951 } else if (tcp_need_reset(old_state) ||
2952 (tp->snd_nxt != tp->write_seq &&
2953 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepancy of Linux wrt. RFC
		 * states
		 */
2957 tcp_send_active_reset(sk, gfp_any());
2958 sk->sk_err = ECONNRESET;
2959 } else if (old_state == TCP_SYN_SENT)
2960 sk->sk_err = ECONNRESET;
2961
2962 tcp_clear_xmit_timers(sk);
2963 __skb_queue_purge(&sk->sk_receive_queue);
2964 if (sk->sk_rx_skb_cache) {
2965 __kfree_skb(sk->sk_rx_skb_cache);
2966 sk->sk_rx_skb_cache = NULL;
2967 }
2968 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
2969 tp->urg_data = 0;
2970 tcp_write_queue_purge(sk);
2971 tcp_fastopen_active_disable_ofo_check(sk);
2972 skb_rbtree_purge(&tp->out_of_order_queue);
2973
2974 inet->inet_dport = 0;
2975
2976 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2977 inet_reset_saddr(sk);
2978
2979 sk->sk_shutdown = 0;
2980 sock_reset_flag(sk, SOCK_DONE);
2981 tp->srtt_us = 0;
2982 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
2983 tp->rcv_rtt_last_tsecr = 0;
2984
2985 seq = tp->write_seq + tp->max_window + 2;
2986 if (!seq)
2987 seq = 1;
2988 WRITE_ONCE(tp->write_seq, seq);
2989
2990 icsk->icsk_backoff = 0;
2991 icsk->icsk_probes_out = 0;
2992 icsk->icsk_probes_tstamp = 0;
2993 icsk->icsk_rto = TCP_TIMEOUT_INIT;
2994 icsk->icsk_rto_min = TCP_RTO_MIN;
2995 icsk->icsk_delack_max = TCP_DELACK_MAX;
2996 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2997 tp->snd_cwnd = TCP_INIT_CWND;
2998 tp->snd_cwnd_cnt = 0;
2999 tp->window_clamp = 0;
3000 tp->delivered = 0;
3001 tp->delivered_ce = 0;
3002 if (icsk->icsk_ca_ops->release)
3003 icsk->icsk_ca_ops->release(sk);
3004 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
3005 icsk->icsk_ca_initialized = 0;
3006 tcp_set_ca_state(sk, TCP_CA_Open);
3007 tp->is_sack_reneg = 0;
3008 tcp_clear_retrans(tp);
3009 tp->total_retrans = 0;
3010 inet_csk_delack_init(sk);
3011
	/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
	 * issue in __tcp_select_window()
	 */
3014 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
3015 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
3016 __sk_dst_reset(sk);
3017 dst_release(sk->sk_rx_dst);
3018 sk->sk_rx_dst = NULL;
3019 tcp_saved_syn_free(tp);
3020 tp->compressed_ack = 0;
3021 tp->segs_in = 0;
3022 tp->segs_out = 0;
3023 tp->bytes_sent = 0;
3024 tp->bytes_acked = 0;
3025 tp->bytes_received = 0;
3026 tp->bytes_retrans = 0;
3027 tp->data_segs_in = 0;
3028 tp->data_segs_out = 0;
3029 tp->duplicate_sack[0].start_seq = 0;
3030 tp->duplicate_sack[0].end_seq = 0;
3031 tp->dsack_dups = 0;
3032 tp->reord_seen = 0;
3033 tp->retrans_out = 0;
3034 tp->sacked_out = 0;
3035 tp->tlp_high_seq = 0;
3036 tp->last_oow_ack_time = 0;
3037
3038 tp->app_limited = ~0U;
3039 tp->rack.mstamp = 0;
3040 tp->rack.advanced = 0;
3041 tp->rack.reo_wnd_steps = 1;
3042 tp->rack.last_delivered = 0;
3043 tp->rack.reo_wnd_persist = 0;
3044 tp->rack.dsack_seen = 0;
3045 tp->syn_data_acked = 0;
3046 tp->rx_opt.saw_tstamp = 0;
3047 tp->rx_opt.dsack = 0;
3048 tp->rx_opt.num_sacks = 0;
3049 tp->rcv_ooopack = 0;
3050
3051
	/* Clean up fastopen related fields */
3053 tcp_free_fastopen_req(tp);
3054 inet->defer_connect = 0;
3055 tp->fastopen_client_fail = 0;
3056
3057 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
3058
3059 if (sk->sk_frag.page) {
3060 put_page(sk->sk_frag.page);
3061 sk->sk_frag.page = NULL;
3062 sk->sk_frag.offset = 0;
3063 }
3064
3065 sk_error_report(sk);
3066 return 0;
3067}
3068EXPORT_SYMBOL(tcp_disconnect);
3069
3070static inline bool tcp_can_repair_sock(const struct sock *sk)
3071{
3072 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
3073 (sk->sk_state != TCP_LISTEN);
3074}
3075
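/* Restore send/receive window state on a socket under TCP repair, with
 * basic sanity checks against rcv_nxt.
 */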
3076static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
3077{
3078 struct tcp_repair_window opt;
3079
3080 if (!tp->repair)
3081 return -EPERM;
3082
3083 if (len != sizeof(opt))
3084 return -EINVAL;
3085
3086 if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
3087 return -EFAULT;
3088
3089 if (opt.max_window < opt.snd_wnd)
3090 return -EINVAL;
3091
3092 if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
3093 return -EINVAL;
3094
3095 if (after(opt.rcv_wup, tp->rcv_nxt))
3096 return -EINVAL;
3097
3098 tp->snd_wl1 = opt.snd_wl1;
3099 tp->snd_wnd = opt.snd_wnd;
3100 tp->max_window = opt.max_window;
3101
3102 tp->rcv_wnd = opt.rcv_wnd;
3103 tp->rcv_wup = opt.rcv_wup;
3104
3105 return 0;
3106}
3107
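/* Re-apply a saved set of TCP options (MSS, window scaling, SACK,
 * timestamps) on an established socket being repaired.
 */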
3108static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
3109 unsigned int len)
3110{
3111 struct tcp_sock *tp = tcp_sk(sk);
3112 struct tcp_repair_opt opt;
3113 size_t offset = 0;
3114
3115 while (len >= sizeof(opt)) {
3116 if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
3117 return -EFAULT;
3118
3119 offset += sizeof(opt);
3120 len -= sizeof(opt);
3121
3122 switch (opt.opt_code) {
3123 case TCPOPT_MSS:
3124 tp->rx_opt.mss_clamp = opt.opt_val;
3125 tcp_mtup_init(sk);
3126 break;
3127 case TCPOPT_WINDOW:
3128 {
3129 u16 snd_wscale = opt.opt_val & 0xFFFF;
3130 u16 rcv_wscale = opt.opt_val >> 16;
3131
3132 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
3133 return -EFBIG;
3134
3135 tp->rx_opt.snd_wscale = snd_wscale;
3136 tp->rx_opt.rcv_wscale = rcv_wscale;
3137 tp->rx_opt.wscale_ok = 1;
3138 }
3139 break;
3140 case TCPOPT_SACK_PERM:
3141 if (opt.opt_val != 0)
3142 return -EINVAL;
3143
3144 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
3145 break;
3146 case TCPOPT_TIMESTAMP:
3147 if (opt.opt_val != 0)
3148 return -EINVAL;
3149
3150 tp->rx_opt.tstamp_ok = 1;
3151 break;
3152 }
3153 }
3154
3155 return 0;
3156}
3157
3158DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
3159EXPORT_SYMBOL(tcp_tx_delay_enabled);
3160
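/* Flip the tcp_tx_delay static key exactly once, the first time any
 * socket enables TCP_TX_DELAY.
 */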
3161static void tcp_enable_tx_delay(void)
3162{
3163 if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
3164 static int __tcp_tx_delay_enabled = 0;
3165
3166 if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
3167 static_branch_enable(&tcp_tx_delay_enabled);
3168 pr_info("TCP_TX_DELAY enabled\n");
3169 }
3170 }
3171}
3172
/* TCP_CORK: when set, always queue non-full frames. Later the user clears
 * this option and we transmit any pending partial frames in the queue. This
 * is meant to be used alongside sendfile() to get properly filled frames
 * when the user (for example) must write out headers with a write() call
 * first and then use sendfile to send out the data parts.
 *
 * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
 * TCP_NODELAY.
 */
3182static void __tcp_sock_set_cork(struct sock *sk, bool on)
3183{
3184 struct tcp_sock *tp = tcp_sk(sk);
3185
3186 if (on) {
3187 tp->nonagle |= TCP_NAGLE_CORK;
3188 } else {
3189 tp->nonagle &= ~TCP_NAGLE_CORK;
3190 if (tp->nonagle & TCP_NAGLE_OFF)
3191 tp->nonagle |= TCP_NAGLE_PUSH;
3192 tcp_push_pending_frames(sk);
3193 }
3194}
3195
3196void tcp_sock_set_cork(struct sock *sk, bool on)
3197{
3198 lock_sock(sk);
3199 __tcp_sock_set_cork(sk, on);
3200 release_sock(sk);
3201}
3202EXPORT_SYMBOL(tcp_sock_set_cork);
3203
/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket
 * is remembered, but it is not activated until cork is cleared.
 *
 * However, when TCP_NODELAY is set we make an explicit push, which overrides
 * even TCP_CORK for currently queued segments.
 */
3210static void __tcp_sock_set_nodelay(struct sock *sk, bool on)
3211{
3212 if (on) {
3213 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
3214 tcp_push_pending_frames(sk);
3215 } else {
3216 tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
3217 }
3218}
3219
3220void tcp_sock_set_nodelay(struct sock *sk)
3221{
3222 lock_sock(sk);
3223 __tcp_sock_set_nodelay(sk, true);
3224 release_sock(sk);
3225}
3226EXPORT_SYMBOL(tcp_sock_set_nodelay);
3227
3228static void __tcp_sock_set_quickack(struct sock *sk, int val)
3229{
3230 if (!val) {
3231 inet_csk_enter_pingpong_mode(sk);
3232 return;
3233 }
3234
3235 inet_csk_exit_pingpong_mode(sk);
3236 if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3237 inet_csk_ack_scheduled(sk)) {
3238 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
3239 tcp_cleanup_rbuf(sk, 1);
3240 if (!(val & 1))
3241 inet_csk_enter_pingpong_mode(sk);
3242 }
3243}
3244
3245void tcp_sock_set_quickack(struct sock *sk, int val)
3246{
3247 lock_sock(sk);
3248 __tcp_sock_set_quickack(sk, val);
3249 release_sock(sk);
3250}
3251EXPORT_SYMBOL(tcp_sock_set_quickack);
3252
3253int tcp_sock_set_syncnt(struct sock *sk, int val)
3254{
3255 if (val < 1 || val > MAX_TCP_SYNCNT)
3256 return -EINVAL;
3257
3258 lock_sock(sk);
3259 inet_csk(sk)->icsk_syn_retries = val;
3260 release_sock(sk);
3261 return 0;
3262}
3263EXPORT_SYMBOL(tcp_sock_set_syncnt);
3264
3265void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
3266{
3267 lock_sock(sk);
3268 inet_csk(sk)->icsk_user_timeout = val;
3269 release_sock(sk);
3270}
3271EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3272
3273int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
3274{
3275 struct tcp_sock *tp = tcp_sk(sk);
3276
3277 if (val < 1 || val > MAX_TCP_KEEPIDLE)
3278 return -EINVAL;
3279
3280 tp->keepalive_time = val * HZ;
3281 if (sock_flag(sk, SOCK_KEEPOPEN) &&
3282 !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
3283 u32 elapsed = keepalive_time_elapsed(tp);
3284
3285 if (tp->keepalive_time > elapsed)
3286 elapsed = tp->keepalive_time - elapsed;
3287 else
3288 elapsed = 0;
3289 inet_csk_reset_keepalive_timer(sk, elapsed);
3290 }
3291
3292 return 0;
3293}
3294
3295int tcp_sock_set_keepidle(struct sock *sk, int val)
3296{
3297 int err;
3298
3299 lock_sock(sk);
3300 err = tcp_sock_set_keepidle_locked(sk, val);
3301 release_sock(sk);
3302 return err;
3303}
3304EXPORT_SYMBOL(tcp_sock_set_keepidle);
3305
3306int tcp_sock_set_keepintvl(struct sock *sk, int val)
3307{
3308 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3309 return -EINVAL;
3310
3311 lock_sock(sk);
3312 tcp_sk(sk)->keepalive_intvl = val * HZ;
3313 release_sock(sk);
3314 return 0;
3315}
3316EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3317
3318int tcp_sock_set_keepcnt(struct sock *sk, int val)
3319{
3320 if (val < 1 || val > MAX_TCP_KEEPCNT)
3321 return -EINVAL;
3322
3323 lock_sock(sk);
3324 tcp_sk(sk)->keepalive_probes = val;
3325 release_sock(sk);
3326 return 0;
3327}
3328EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3329
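/* Set TCP_WINDOW_CLAMP: bound the advertised window, never below
 * SOCK_MIN_RCVBUF/2; clearing it is only allowed on a closed socket.
 */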
3330int tcp_set_window_clamp(struct sock *sk, int val)
3331{
3332 struct tcp_sock *tp = tcp_sk(sk);
3333
3334 if (!val) {
3335 if (sk->sk_state != TCP_CLOSE)
3336 return -EINVAL;
3337 tp->window_clamp = 0;
3338 } else {
3339 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3340 SOCK_MIN_RCVBUF / 2 : val;
3341 }
3342 return 0;
3343}
3344
/*
 *	Socket option code for TCP.
 */
3348static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3349 sockptr_t optval, unsigned int optlen)
3350{
3351 struct tcp_sock *tp = tcp_sk(sk);
3352 struct inet_connection_sock *icsk = inet_csk(sk);
3353 struct net *net = sock_net(sk);
3354 int val;
3355 int err = 0;
3356
	/* These are data/string values, all the others are ints */
3358 switch (optname) {
3359 case TCP_CONGESTION: {
3360 char name[TCP_CA_NAME_MAX];
3361
3362 if (optlen < 1)
3363 return -EINVAL;
3364
3365 val = strncpy_from_sockptr(name, optval,
3366 min_t(long, TCP_CA_NAME_MAX-1, optlen));
3367 if (val < 0)
3368 return -EFAULT;
3369 name[val] = 0;
3370
3371 lock_sock(sk);
3372 err = tcp_set_congestion_control(sk, name, true,
3373 ns_capable(sock_net(sk)->user_ns,
3374 CAP_NET_ADMIN));
3375 release_sock(sk);
3376 return err;
3377 }
3378 case TCP_ULP: {
3379 char name[TCP_ULP_NAME_MAX];
3380
3381 if (optlen < 1)
3382 return -EINVAL;
3383
3384 val = strncpy_from_sockptr(name, optval,
3385 min_t(long, TCP_ULP_NAME_MAX - 1,
3386 optlen));
3387 if (val < 0)
3388 return -EFAULT;
3389 name[val] = 0;
3390
3391 lock_sock(sk);
3392 err = tcp_set_ulp(sk, name);
3393 release_sock(sk);
3394 return err;
3395 }
3396 case TCP_FASTOPEN_KEY: {
3397 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3398 __u8 *backup_key = NULL;
3399
		/* Allow a backup key as well to facilitate key rotation
		 * First key is the active one.
		 */
3403 if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3404 optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3405 return -EINVAL;
3406
3407 if (copy_from_sockptr(key, optval, optlen))
3408 return -EFAULT;
3409
3410 if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3411 backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3412
3413 return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
3414 }
3415 default:
		/* fallthru */
3417 break;
3418 }
3419
3420 if (optlen < sizeof(int))
3421 return -EINVAL;
3422
3423 if (copy_from_sockptr(&val, optval, sizeof(val)))
3424 return -EFAULT;
3425
3426 lock_sock(sk);
3427
3428 switch (optname) {
3429 case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used.
		 */
3434 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
3435 err = -EINVAL;
3436 break;
3437 }
3438 tp->rx_opt.user_mss = val;
3439 break;
3440
3441 case TCP_NODELAY:
3442 __tcp_sock_set_nodelay(sk, val);
3443 break;
3444
3445 case TCP_THIN_LINEAR_TIMEOUTS:
3446 if (val < 0 || val > 1)
3447 err = -EINVAL;
3448 else
3449 tp->thin_lto = val;
3450 break;
3451
3452 case TCP_THIN_DUPACK:
3453 if (val < 0 || val > 1)
3454 err = -EINVAL;
3455 break;
3456
3457 case TCP_REPAIR:
3458 if (!tcp_can_repair_sock(sk))
3459 err = -EPERM;
3460 else if (val == TCP_REPAIR_ON) {
3461 tp->repair = 1;
3462 sk->sk_reuse = SK_FORCE_REUSE;
3463 tp->repair_queue = TCP_NO_QUEUE;
3464 } else if (val == TCP_REPAIR_OFF) {
3465 tp->repair = 0;
3466 sk->sk_reuse = SK_NO_REUSE;
3467 tcp_send_window_probe(sk);
3468 } else if (val == TCP_REPAIR_OFF_NO_WP) {
3469 tp->repair = 0;
3470 sk->sk_reuse = SK_NO_REUSE;
3471 } else
3472 err = -EINVAL;
3473
3474 break;
3475
3476 case TCP_REPAIR_QUEUE:
3477 if (!tp->repair)
3478 err = -EPERM;
3479 else if ((unsigned int)val < TCP_QUEUES_NR)
3480 tp->repair_queue = val;
3481 else
3482 err = -EINVAL;
3483 break;
3484
3485 case TCP_QUEUE_SEQ:
3486 if (sk->sk_state != TCP_CLOSE) {
3487 err = -EPERM;
3488 } else if (tp->repair_queue == TCP_SEND_QUEUE) {
3489 if (!tcp_rtx_queue_empty(sk))
3490 err = -EPERM;
3491 else
3492 WRITE_ONCE(tp->write_seq, val);
3493 } else if (tp->repair_queue == TCP_RECV_QUEUE) {
3494 if (tp->rcv_nxt != tp->copied_seq) {
3495 err = -EPERM;
3496 } else {
3497 WRITE_ONCE(tp->rcv_nxt, val);
3498 WRITE_ONCE(tp->copied_seq, val);
3499 }
3500 } else {
3501 err = -EINVAL;
3502 }
3503 break;
3504
3505 case TCP_REPAIR_OPTIONS:
3506 if (!tp->repair)
3507 err = -EINVAL;
3508 else if (sk->sk_state == TCP_ESTABLISHED)
3509 err = tcp_repair_options_est(sk, optval, optlen);
3510 else
3511 err = -EPERM;
3512 break;
3513
3514 case TCP_CORK:
3515 __tcp_sock_set_cork(sk, val);
3516 break;
3517
3518 case TCP_KEEPIDLE:
3519 err = tcp_sock_set_keepidle_locked(sk, val);
3520 break;
3521 case TCP_KEEPINTVL:
3522 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3523 err = -EINVAL;
3524 else
3525 tp->keepalive_intvl = val * HZ;
3526 break;
3527 case TCP_KEEPCNT:
3528 if (val < 1 || val > MAX_TCP_KEEPCNT)
3529 err = -EINVAL;
3530 else
3531 tp->keepalive_probes = val;
3532 break;
3533 case TCP_SYNCNT:
3534 if (val < 1 || val > MAX_TCP_SYNCNT)
3535 err = -EINVAL;
3536 else
3537 icsk->icsk_syn_retries = val;
3538 break;
3539
3540 case TCP_SAVE_SYN:
		/* 0: disable, 1: enable, 2: start from ether_header */
3542 if (val < 0 || val > 2)
3543 err = -EINVAL;
3544 else
3545 tp->save_syn = val;
3546 break;
3547
3548 case TCP_LINGER2:
3549 if (val < 0)
3550 tp->linger2 = -1;
3551 else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3552 tp->linger2 = TCP_FIN_TIMEOUT_MAX;
3553 else
3554 tp->linger2 = val * HZ;
3555 break;
3556
3557 case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
3559 icsk->icsk_accept_queue.rskq_defer_accept =
3560 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3561 TCP_RTO_MAX / HZ);
3562 break;
3563
3564 case TCP_WINDOW_CLAMP:
3565 err = tcp_set_window_clamp(sk, val);
3566 break;
3567
3568 case TCP_QUICKACK:
3569 __tcp_sock_set_quickack(sk, val);
3570 break;
3571
3572#ifdef CONFIG_TCP_MD5SIG
3573 case TCP_MD5SIG:
3574 case TCP_MD5SIG_EXT:
3575 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3576 break;
3577#endif
3578 case TCP_USER_TIMEOUT:
		/* Cap the max time in ms TCP will retry or probe the window
		 * before giving up and aborting (ETIMEDOUT) a connection.
		 */
3582 if (val < 0)
3583 err = -EINVAL;
3584 else
3585 icsk->icsk_user_timeout = val;
3586 break;
3587
3588 case TCP_FASTOPEN:
3589 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3590 TCPF_LISTEN))) {
3591 tcp_fastopen_init_key_once(net);
3592
3593 fastopen_queue_tune(sk, val);
3594 } else {
3595 err = -EINVAL;
3596 }
3597 break;
3598 case TCP_FASTOPEN_CONNECT:
3599 if (val > 1 || val < 0) {
3600 err = -EINVAL;
3601 } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3602 if (sk->sk_state == TCP_CLOSE)
3603 tp->fastopen_connect = val;
3604 else
3605 err = -EINVAL;
3606 } else {
3607 err = -EOPNOTSUPP;
3608 }
3609 break;
3610 case TCP_FASTOPEN_NO_COOKIE:
3611 if (val > 1 || val < 0)
3612 err = -EINVAL;
3613 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3614 err = -EINVAL;
3615 else
3616 tp->fastopen_no_cookie = val;
3617 break;
3618 case TCP_TIMESTAMP:
3619 if (!tp->repair)
3620 err = -EPERM;
3621 else
3622 tp->tsoffset = val - tcp_time_stamp_raw();
3623 break;
3624 case TCP_REPAIR_WINDOW:
3625 err = tcp_repair_set_window(tp, optval, optlen);
3626 break;
3627 case TCP_NOTSENT_LOWAT:
3628 tp->notsent_lowat = val;
3629 sk->sk_write_space(sk);
3630 break;
3631 case TCP_INQ:
3632 if (val > 1 || val < 0)
3633 err = -EINVAL;
3634 else
3635 tp->recvmsg_inq = val;
3636 break;
3637 case TCP_TX_DELAY:
3638 if (val)
3639 tcp_enable_tx_delay();
3640 tp->tcp_tx_delay = val;
3641 break;
3642 default:
3643 err = -ENOPROTOOPT;
3644 break;
3645 }
3646
3647 release_sock(sk);
3648 return err;
3649}
3650
3651int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
3652 unsigned int optlen)
3653{
3654 const struct inet_connection_sock *icsk = inet_csk(sk);
3655
3656 if (level != SOL_TCP)
3657 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3658 optval, optlen);
3659 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3660}
3661EXPORT_SYMBOL(tcp_setsockopt);
3662
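/* Convert the per-state chrono counters (busy, rwnd limited, sndbuf
 * limited) from jiffies to usec for tcp_info.
 */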
3663static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3664 struct tcp_info *info)
3665{
3666 u64 stats[__TCP_CHRONO_MAX], total = 0;
3667 enum tcp_chrono i;
3668
3669 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3670 stats[i] = tp->chrono_stat[i - 1];
3671 if (i == tp->chrono_type)
3672 stats[i] += tcp_jiffies32 - tp->chrono_start;
3673 stats[i] *= USEC_PER_SEC / HZ;
3674 total += stats[i];
3675 }
3676
3677 info->tcpi_busy_time = total;
3678 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3679 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3680}
3681
/* Return information about state of tcp endpoint in API format. */
3683void tcp_get_info(struct sock *sk, struct tcp_info *info)
3684{
3685 const struct tcp_sock *tp = tcp_sk(sk);
3686 const struct inet_connection_sock *icsk = inet_csk(sk);
3687 unsigned long rate;
3688 u32 now;
3689 u64 rate64;
3690 bool slow;
3691
3692 memset(info, 0, sizeof(*info));
3693 if (sk->sk_type != SOCK_STREAM)
3694 return;
3695
3696 info->tcpi_state = inet_sk_state_load(sk);
3697
	/* Report meaningful fields for all TCP states, including listeners */
3699 rate = READ_ONCE(sk->sk_pacing_rate);
3700 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3701 info->tcpi_pacing_rate = rate64;
3702
3703 rate = READ_ONCE(sk->sk_max_pacing_rate);
3704 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3705 info->tcpi_max_pacing_rate = rate64;
3706
3707 info->tcpi_reordering = tp->reordering;
3708 info->tcpi_snd_cwnd = tp->snd_cwnd;
3709
3710 if (info->tcpi_state == TCP_LISTEN) {
		/* listeners aliased fields :
		 * tcpi_unacked -> Number of children ready for accept()
		 * tcpi_sacked  -> max backlog
		 */
3715 info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3716 info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
3717 return;
3718 }
3719
3720 slow = lock_sock_fast(sk);
3721
3722 info->tcpi_ca_state = icsk->icsk_ca_state;
3723 info->tcpi_retransmits = icsk->icsk_retransmits;
3724 info->tcpi_probes = icsk->icsk_probes_out;
3725 info->tcpi_backoff = icsk->icsk_backoff;
3726
3727 if (tp->rx_opt.tstamp_ok)
3728 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3729 if (tcp_is_sack(tp))
3730 info->tcpi_options |= TCPI_OPT_SACK;
3731 if (tp->rx_opt.wscale_ok) {
3732 info->tcpi_options |= TCPI_OPT_WSCALE;
3733 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3734 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3735 }
3736
3737 if (tp->ecn_flags & TCP_ECN_OK)
3738 info->tcpi_options |= TCPI_OPT_ECN;
3739 if (tp->ecn_flags & TCP_ECN_SEEN)
3740 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3741 if (tp->syn_data_acked)
3742 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3743
3744 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3745 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3746 info->tcpi_snd_mss = tp->mss_cache;
3747 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3748
3749 info->tcpi_unacked = tp->packets_out;
3750 info->tcpi_sacked = tp->sacked_out;
3751
3752 info->tcpi_lost = tp->lost_out;
3753 info->tcpi_retrans = tp->retrans_out;
3754
3755 now = tcp_jiffies32;
3756 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3757 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3758 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3759
3760 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3761 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3762 info->tcpi_rtt = tp->srtt_us >> 3;
3763 info->tcpi_rttvar = tp->mdev_us >> 2;
3764 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3765 info->tcpi_advmss = tp->advmss;
3766
3767 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3768 info->tcpi_rcv_space = tp->rcvq_space.space;
3769
3770 info->tcpi_total_retrans = tp->total_retrans;
3771
3772 info->tcpi_bytes_acked = tp->bytes_acked;
3773 info->tcpi_bytes_received = tp->bytes_received;
3774 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3775 tcp_get_info_chrono_stats(tp, info);
3776
3777 info->tcpi_segs_out = tp->segs_out;
3778 info->tcpi_segs_in = tp->segs_in;
3779
3780 info->tcpi_min_rtt = tcp_min_rtt(tp);
3781 info->tcpi_data_segs_in = tp->data_segs_in;
3782 info->tcpi_data_segs_out = tp->data_segs_out;
3783
3784 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3785 rate64 = tcp_compute_delivery_rate(tp);
3786 if (rate64)
3787 info->tcpi_delivery_rate = rate64;
3788 info->tcpi_delivered = tp->delivered;
3789 info->tcpi_delivered_ce = tp->delivered_ce;
3790 info->tcpi_bytes_sent = tp->bytes_sent;
3791 info->tcpi_bytes_retrans = tp->bytes_retrans;
3792 info->tcpi_dsack_dups = tp->dsack_dups;
3793 info->tcpi_reord_seen = tp->reord_seen;
3794 info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3795 info->tcpi_snd_wnd = tp->snd_wnd;
3796 info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
3797 unlock_sock_fast(sk, slow);
3798}
3799EXPORT_SYMBOL_GPL(tcp_get_info);
3800
3801static size_t tcp_opt_stats_get_size(void)
3802{
3803 return
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
		nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
		nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
		nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
		nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
3830 0;
3831}
3832
/* Returns TTL or hop limit of an incoming packet from skb. */
3834static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
3835{
3836 if (skb->protocol == htons(ETH_P_IP))
3837 return ip_hdr(skb)->ttl;
3838 else if (skb->protocol == htons(ETH_P_IPV6))
3839 return ipv6_hdr(skb)->hop_limit;
3840 else
3841 return 0;
3842}
3843
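/* Build the OPT_STATS netlink attribute blob that is attached to tx
 * timestamp error-queue messages.
 */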
3844struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3845 const struct sk_buff *orig_skb,
3846 const struct sk_buff *ack_skb)
3847{
3848 const struct tcp_sock *tp = tcp_sk(sk);
3849 struct sk_buff *stats;
3850 struct tcp_info info;
3851 unsigned long rate;
3852 u64 rate64;
3853
3854 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3855 if (!stats)
3856 return NULL;
3857
3858 tcp_get_info_chrono_stats(tp, &info);
3859 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3860 info.tcpi_busy_time, TCP_NLA_PAD);
3861 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3862 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3863 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3864 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3865 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3866 tp->data_segs_out, TCP_NLA_PAD);
3867 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3868 tp->total_retrans, TCP_NLA_PAD);
3869
3870 rate = READ_ONCE(sk->sk_pacing_rate);
3871 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3872 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3873
3874 rate64 = tcp_compute_delivery_rate(tp);
3875 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3876
3877 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3878 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3879 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3880
3881 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3882 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3883 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3884 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3885 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3886
3887 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3888 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3889
3890 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3891 TCP_NLA_PAD);
3892 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3893 TCP_NLA_PAD);
3894 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3895 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3896 nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3897 nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
3898 nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
3899 max_t(int, 0, tp->write_seq - tp->snd_nxt));
3900 nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
3901 TCP_NLA_PAD);
3902 if (ack_skb)
3903 nla_put_u8(stats, TCP_NLA_TTL,
3904 tcp_skb_ttl_or_hop_limit(ack_skb));
3905
3906 return stats;
3907}
3908
3909static int do_tcp_getsockopt(struct sock *sk, int level,
3910 int optname, char __user *optval, int __user *optlen)
3911{
3912 struct inet_connection_sock *icsk = inet_csk(sk);
3913 struct tcp_sock *tp = tcp_sk(sk);
3914 struct net *net = sock_net(sk);
3915 int val, len;
3916
3917 if (get_user(len, optlen))
3918 return -EFAULT;
3919
3920 len = min_t(unsigned int, len, sizeof(int));
3921
3922 if (len < 0)
3923 return -EINVAL;
3924
3925 switch (optname) {
3926 case TCP_MAXSEG:
3927 val = tp->mss_cache;
3928 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3929 val = tp->rx_opt.user_mss;
3930 if (tp->repair)
3931 val = tp->rx_opt.mss_clamp;
3932 break;
3933 case TCP_NODELAY:
3934 val = !!(tp->nonagle&TCP_NAGLE_OFF);
3935 break;
3936 case TCP_CORK:
3937 val = !!(tp->nonagle&TCP_NAGLE_CORK);
3938 break;
3939 case TCP_KEEPIDLE:
3940 val = keepalive_time_when(tp) / HZ;
3941 break;
3942 case TCP_KEEPINTVL:
3943 val = keepalive_intvl_when(tp) / HZ;
3944 break;
3945 case TCP_KEEPCNT:
3946 val = keepalive_probes(tp);
3947 break;
3948 case TCP_SYNCNT:
3949 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3950 break;
3951 case TCP_LINGER2:
3952 val = tp->linger2;
3953 if (val >= 0)
3954 val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3955 break;
3956 case TCP_DEFER_ACCEPT:
3957 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3958 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3959 break;
3960 case TCP_WINDOW_CLAMP:
3961 val = tp->window_clamp;
3962 break;
3963 case TCP_INFO: {
3964 struct tcp_info info;
3965
3966 if (get_user(len, optlen))
3967 return -EFAULT;
3968
3969 tcp_get_info(sk, &info);
3970
3971 len = min_t(unsigned int, len, sizeof(info));
3972 if (put_user(len, optlen))
3973 return -EFAULT;
3974 if (copy_to_user(optval, &info, len))
3975 return -EFAULT;
3976 return 0;
3977 }
3978 case TCP_CC_INFO: {
3979 const struct tcp_congestion_ops *ca_ops;
3980 union tcp_cc_info info;
3981 size_t sz = 0;
3982 int attr;
3983
3984 if (get_user(len, optlen))
3985 return -EFAULT;
3986
3987 ca_ops = icsk->icsk_ca_ops;
3988 if (ca_ops && ca_ops->get_info)
3989 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
3990
3991 len = min_t(unsigned int, len, sz);
3992 if (put_user(len, optlen))
3993 return -EFAULT;
3994 if (copy_to_user(optval, &info, len))
3995 return -EFAULT;
3996 return 0;
3997 }
3998 case TCP_QUICKACK:
3999 val = !inet_csk_in_pingpong_mode(sk);
4000 break;
4001
4002 case TCP_CONGESTION:
4003 if (get_user(len, optlen))
4004 return -EFAULT;
4005 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
4006 if (put_user(len, optlen))
4007 return -EFAULT;
4008 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
4009 return -EFAULT;
4010 return 0;
4011
4012 case TCP_ULP:
4013 if (get_user(len, optlen))
4014 return -EFAULT;
4015 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
4016 if (!icsk->icsk_ulp_ops) {
4017 if (put_user(0, optlen))
4018 return -EFAULT;
4019 return 0;
4020 }
4021 if (put_user(len, optlen))
4022 return -EFAULT;
4023 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
4024 return -EFAULT;
4025 return 0;
4026
4027 case TCP_FASTOPEN_KEY: {
4028 u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
4029 unsigned int key_len;
4030
4031 if (get_user(len, optlen))
4032 return -EFAULT;
4033
4034 key_len = tcp_fastopen_get_cipher(net, icsk, key) *
4035 TCP_FASTOPEN_KEY_LENGTH;
4036 len = min_t(unsigned int, len, key_len);
4037 if (put_user(len, optlen))
4038 return -EFAULT;
4039 if (copy_to_user(optval, key, len))
4040 return -EFAULT;
4041 return 0;
4042 }
4043 case TCP_THIN_LINEAR_TIMEOUTS:
4044 val = tp->thin_lto;
4045 break;
4046
4047 case TCP_THIN_DUPACK:
4048 val = 0;
4049 break;
4050
4051 case TCP_REPAIR:
4052 val = tp->repair;
4053 break;
4054
4055 case TCP_REPAIR_QUEUE:
4056 if (tp->repair)
4057 val = tp->repair_queue;
4058 else
4059 return -EINVAL;
4060 break;
4061
4062 case TCP_REPAIR_WINDOW: {
4063 struct tcp_repair_window opt;
4064
4065 if (get_user(len, optlen))
4066 return -EFAULT;
4067
4068 if (len != sizeof(opt))
4069 return -EINVAL;
4070
4071 if (!tp->repair)
4072 return -EPERM;
4073
4074 opt.snd_wl1 = tp->snd_wl1;
4075 opt.snd_wnd = tp->snd_wnd;
4076 opt.max_window = tp->max_window;
4077 opt.rcv_wnd = tp->rcv_wnd;
4078 opt.rcv_wup = tp->rcv_wup;
4079
4080 if (copy_to_user(optval, &opt, len))
4081 return -EFAULT;
4082 return 0;
4083 }
4084 case TCP_QUEUE_SEQ:
4085 if (tp->repair_queue == TCP_SEND_QUEUE)
4086 val = tp->write_seq;
4087 else if (tp->repair_queue == TCP_RECV_QUEUE)
4088 val = tp->rcv_nxt;
4089 else
4090 return -EINVAL;
4091 break;
4092
4093 case TCP_USER_TIMEOUT:
4094 val = icsk->icsk_user_timeout;
4095 break;
4096
4097 case TCP_FASTOPEN:
4098 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
4099 break;
4100
4101 case TCP_FASTOPEN_CONNECT:
4102 val = tp->fastopen_connect;
4103 break;
4104
4105 case TCP_FASTOPEN_NO_COOKIE:
4106 val = tp->fastopen_no_cookie;
4107 break;
4108
4109 case TCP_TX_DELAY:
4110 val = tp->tcp_tx_delay;
4111 break;
4112
4113 case TCP_TIMESTAMP:
4114 val = tcp_time_stamp_raw() + tp->tsoffset;
4115 break;
4116 case TCP_NOTSENT_LOWAT:
4117 val = tp->notsent_lowat;
4118 break;
4119 case TCP_INQ:
4120 val = tp->recvmsg_inq;
4121 break;
4122 case TCP_SAVE_SYN:
4123 val = tp->save_syn;
4124 break;
4125 case TCP_SAVED_SYN: {
4126 if (get_user(len, optlen))
4127 return -EFAULT;
4128
4129 lock_sock(sk);
4130 if (tp->saved_syn) {
4131 if (len < tcp_saved_syn_len(tp->saved_syn)) {
4132 if (put_user(tcp_saved_syn_len(tp->saved_syn),
4133 optlen)) {
4134 release_sock(sk);
4135 return -EFAULT;
4136 }
4137 release_sock(sk);
4138 return -EINVAL;
4139 }
4140 len = tcp_saved_syn_len(tp->saved_syn);
4141 if (put_user(len, optlen)) {
4142 release_sock(sk);
4143 return -EFAULT;
4144 }
4145 if (copy_to_user(optval, tp->saved_syn->data, len)) {
4146 release_sock(sk);
4147 return -EFAULT;
4148 }
4149 tcp_saved_syn_free(tp);
4150 release_sock(sk);
4151 } else {
4152 release_sock(sk);
4153 len = 0;
4154 if (put_user(len, optlen))
4155 return -EFAULT;
4156 }
4157 return 0;
4158 }
4159#ifdef CONFIG_MMU
4160 case TCP_ZEROCOPY_RECEIVE: {
4161 struct scm_timestamping_internal tss;
4162 struct tcp_zerocopy_receive zc = {};
4163 int err;
4164
4165 if (get_user(len, optlen))
4166 return -EFAULT;
4167 if (len < 0 ||
4168 len < offsetofend(struct tcp_zerocopy_receive, length))
4169 return -EINVAL;
4170 if (unlikely(len > sizeof(zc))) {
4171 err = check_zeroed_user(optval + sizeof(zc),
4172 len - sizeof(zc));
4173 if (err < 1)
4174 return err == 0 ? -EINVAL : err;
4175 len = sizeof(zc);
4176 if (put_user(len, optlen))
4177 return -EFAULT;
4178 }
4179 if (copy_from_user(&zc, optval, len))
4180 return -EFAULT;
4181 if (zc.reserved)
4182 return -EINVAL;
4183 if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
4184 return -EINVAL;
4185 lock_sock(sk);
4186 err = tcp_zerocopy_receive(sk, &zc, &tss);
4187 err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
4188 &zc, &len, err);
4189 release_sock(sk);
4190 if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
4191 goto zerocopy_rcv_cmsg;
4192 switch (len) {
4193 case offsetofend(struct tcp_zerocopy_receive, msg_flags):
4194 goto zerocopy_rcv_cmsg;
4195 case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
4196 case offsetofend(struct tcp_zerocopy_receive, msg_control):
4197 case offsetofend(struct tcp_zerocopy_receive, flags):
4198 case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
4199 case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
4200 case offsetofend(struct tcp_zerocopy_receive, err):
4201 goto zerocopy_rcv_sk_err;
4202 case offsetofend(struct tcp_zerocopy_receive, inq):
4203 goto zerocopy_rcv_inq;
4204 case offsetofend(struct tcp_zerocopy_receive, length):
4205 default:
4206 goto zerocopy_rcv_out;
4207 }
4208zerocopy_rcv_cmsg:
4209 if (zc.msg_flags & TCP_CMSG_TS)
4210 tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
4211 else
4212 zc.msg_flags = 0;
4213zerocopy_rcv_sk_err:
4214 if (!err)
4215 zc.err = sock_error(sk);
4216zerocopy_rcv_inq:
4217 zc.inq = tcp_inq_hint(sk);
4218zerocopy_rcv_out:
4219 if (!err && copy_to_user(optval, &zc, len))
4220 err = -EFAULT;
4221 return err;
4222 }
4223#endif
4224 default:
4225 return -ENOPROTOOPT;
4226 }
4227
4228 if (put_user(len, optlen))
4229 return -EFAULT;
4230 if (copy_to_user(optval, &val, len))
4231 return -EFAULT;
4232 return 0;
4233}
4234
4235bool tcp_bpf_bypass_getsockopt(int level, int optname)
4236{
	/* TCP do_tcp_getsockopt has optimized getsockopt implementation
	 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
	 */
4240 if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
4241 return true;
4242
4243 return false;
4244}
4245EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
4246
4247int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
4248 int __user *optlen)
4249{
4250 struct inet_connection_sock *icsk = inet_csk(sk);
4251
4252 if (level != SOL_TCP)
4253 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
4254 optval, optlen);
4255 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
4256}
4257EXPORT_SYMBOL(tcp_getsockopt);
4258
4259#ifdef CONFIG_TCP_MD5SIG
4260static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
4261static DEFINE_MUTEX(tcp_md5sig_mutex);
4262static bool tcp_md5sig_pool_populated = false;
4263
4264static void __tcp_alloc_md5sig_pool(void)
4265{
4266 struct crypto_ahash *hash;
4267 int cpu;
4268
4269 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
4270 if (IS_ERR(hash))
4271 return;
4272
4273 for_each_possible_cpu(cpu) {
4274 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
4275 struct ahash_request *req;
4276
4277 if (!scratch) {
4278 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
4279 sizeof(struct tcphdr),
4280 GFP_KERNEL,
4281 cpu_to_node(cpu));
4282 if (!scratch)
4283 return;
4284 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
4285 }
4286 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
4287 continue;
4288
4289 req = ahash_request_alloc(hash, GFP_KERNEL);
4290 if (!req)
4291 return;
4292
4293 ahash_request_set_callback(req, 0, NULL, NULL);
4294
4295 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
4296 }
4297
	/* before setting tcp_md5sig_pool_populated, we must commit all writes
	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
	 */
4300 smp_wmb();
4301 tcp_md5sig_pool_populated = true;
4302}
4303
4304bool tcp_alloc_md5sig_pool(void)
4305{
4306 if (unlikely(!tcp_md5sig_pool_populated)) {
4307 mutex_lock(&tcp_md5sig_mutex);
4308
4309 if (!tcp_md5sig_pool_populated) {
4310 __tcp_alloc_md5sig_pool();
4311 if (tcp_md5sig_pool_populated)
4312 static_branch_inc(&tcp_md5_needed);
4313 }
4314
4315 mutex_unlock(&tcp_md5sig_mutex);
4316 }
4317 return tcp_md5sig_pool_populated;
4318}
4319EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
4320

/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	We use percpu structure, so if we succeed, we exit with preemption
 *	and BH disabled, to make sure another thread or softirq handling
 *	won't try to get same context.
 */
4329struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
4330{
4331 local_bh_disable();
4332
4333 if (tcp_md5sig_pool_populated) {
		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
4335 smp_rmb();
4336 return this_cpu_ptr(&tcp_md5sig_pool);
4337 }
4338 local_bh_enable();
4339 return NULL;
4340}
4341EXPORT_SYMBOL(tcp_get_md5sig_pool);
4342
4343int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
4344 const struct sk_buff *skb, unsigned int header_len)
4345{
4346 struct scatterlist sg;
4347 const struct tcphdr *tp = tcp_hdr(skb);
4348 struct ahash_request *req = hp->md5_req;
4349 unsigned int i;
4350 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
4351 skb_headlen(skb) - header_len : 0;
4352 const struct skb_shared_info *shi = skb_shinfo(skb);
4353 struct sk_buff *frag_iter;
4354
4355 sg_init_table(&sg, 1);
4356
4357 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
4358 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
4359 if (crypto_ahash_update(req))
4360 return 1;
4361
4362 for (i = 0; i < shi->nr_frags; ++i) {
4363 const skb_frag_t *f = &shi->frags[i];
4364 unsigned int offset = skb_frag_off(f);
4365 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
4366
4367 sg_set_page(&sg, page, skb_frag_size(f),
4368 offset_in_page(offset));
4369 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
4370 if (crypto_ahash_update(req))
4371 return 1;
4372 }
4373
4374 skb_walk_frags(skb, frag_iter)
4375 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
4376 return 1;
4377
4378 return 0;
4379}
4380EXPORT_SYMBOL(tcp_md5_hash_skb_data);
4381
4382int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
4383{
	u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
4385 struct scatterlist sg;
4386
4387 sg_init_one(&sg, key->key, keylen);
4388 ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
4389
	/* We use data_race() because tcp_md5_do_add() might change
	 * key->key under us
	 */
4391 return data_race(crypto_ahash_update(hp->md5_req));
4392}
4393EXPORT_SYMBOL(tcp_md5_hash_key);
4394
4395#endif
4396
4397void tcp_done(struct sock *sk)
4398{
4399 struct request_sock *req;
4400
	/* We might be called with a new socket, after
	 * inet_csk_prepare_forced_close() has been called
	 * so we can not use lockdep_sock_is_held(sk)
	 */
4405 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
4406
4407 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
4408 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
4409
4410 tcp_set_state(sk, TCP_CLOSE);
4411 tcp_clear_xmit_timers(sk);
4412 if (req)
4413 reqsk_fastopen_remove(sk, req, false);
4414
4415 sk->sk_shutdown = SHUTDOWN_MASK;
4416
4417 if (!sock_flag(sk, SOCK_DEAD))
4418 sk->sk_state_change(sk);
4419 else
4420 inet_csk_destroy_sock(sk);
4421}
4422EXPORT_SYMBOL_GPL(tcp_done);
4423
4424int tcp_abort(struct sock *sk, int err)
4425{
4426 if (!sk_fullsock(sk)) {
4427 if (sk->sk_state == TCP_NEW_SYN_RECV) {
4428 struct request_sock *req = inet_reqsk(sk);
4429
4430 local_bh_disable();
4431 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
4432 local_bh_enable();
4433 return 0;
4434 }
4435 return -EOPNOTSUPP;
4436 }
4437
	/* Don't race with userspace socket closes such as tcp_close. */
4439 lock_sock(sk);
4440
4441 if (sk->sk_state == TCP_LISTEN) {
4442 tcp_set_state(sk, TCP_CLOSE);
4443 inet_csk_listen_stop(sk);
4444 }
4445
	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
4447 local_bh_disable();
4448 bh_lock_sock(sk);
4449
4450 if (!sock_flag(sk, SOCK_DEAD)) {
4451 sk->sk_err = err;
		/* This barrier is coupled with smp_rmb() in tcp_poll() */
4453 smp_wmb();
4454 sk_error_report(sk);
4455 if (tcp_need_reset(sk->sk_state))
4456 tcp_send_active_reset(sk, GFP_ATOMIC);
4457 tcp_done(sk);
4458 }
4459
4460 bh_unlock_sock(sk);
4461 local_bh_enable();
4462 tcp_write_queue_purge(sk);
4463 release_sock(sk);
4464 return 0;
4465}
4466EXPORT_SYMBOL_GPL(tcp_abort);
4467
4468extern struct tcp_congestion_ops tcp_reno;
4469
4470static __initdata unsigned long thash_entries;
4471static int __init set_thash_entries(char *str)
4472{
4473 ssize_t ret;
4474
4475 if (!str)
4476 return 0;
4477
4478 ret = kstrtoul(str, 0, &thash_entries);
4479 if (ret)
4480 return 0;
4481
4482 return 1;
4483}
4484__setup("thash_entries=", set_thash_entries);
4485
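/* Derive the default tcp_mem[] pressure thresholds from the amount of
 * available page cache memory.
 */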
4486static void __init tcp_init_mem(void)
4487{
4488 unsigned long limit = nr_free_buffer_pages() / 16;
4489
4490 limit = max(limit, 128UL);
4491 sysctl_tcp_mem[0] = limit / 4 * 3;
4492 sysctl_tcp_mem[1] = limit;
4493 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
4494}
4495
4496void __init tcp_init(void)
4497{
4498 int max_rshare, max_wshare, cnt;
4499 unsigned long limit;
4500 unsigned int i;
4501
4502 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
4503 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
4504 sizeof_field(struct sk_buff, cb));
4505
4506 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
4507 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
4508 inet_hashinfo_init(&tcp_hashinfo);
4509 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
4510 thash_entries, 21,
4511 0, 64 * 1024);
4512 tcp_hashinfo.bind_bucket_cachep =
4513 kmem_cache_create("tcp_bind_bucket",
4514 sizeof(struct inet_bind_bucket), 0,
4515 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
4516
	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
4522 tcp_hashinfo.ehash =
4523 alloc_large_system_hash("TCP established",
4524 sizeof(struct inet_ehash_bucket),
4525 thash_entries,
4526 17,
4527 0,
4528 NULL,
4529 &tcp_hashinfo.ehash_mask,
4530 0,
4531 thash_entries ? 0 : 512 * 1024);
4532 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
4533 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4534
4535 if (inet_ehash_locks_alloc(&tcp_hashinfo))
4536 panic("TCP: failed to alloc ehash_locks");
4537 tcp_hashinfo.bhash =
4538 alloc_large_system_hash("TCP bind",
4539 sizeof(struct inet_bind_hashbucket),
4540 tcp_hashinfo.ehash_mask + 1,
4541 17,
4542 0,
4543 &tcp_hashinfo.bhash_size,
4544 NULL,
4545 0,
4546 64 * 1024);
4547 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4548 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4549 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4550 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4551 }
4552
4553
4554 cnt = tcp_hashinfo.ehash_mask + 1;
4555 sysctl_tcp_max_orphans = cnt / 2;
4556
4557 tcp_init_mem();
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
4559 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4560 max_wshare = min(4UL*1024*1024, limit);
4561 max_rshare = min(6UL*1024*1024, limit);
4562
4563 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
4564 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4565 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4566
4567 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
4568 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4569 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4570
4571 pr_info("Hash tables configured (established %u bind %u)\n",
4572 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4573
4574 tcp_v4_init();
4575 tcp_metrics_init();
4576 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4577 tcp_tasklet_init();
4578 mptcp_init();
4579}
4580