#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>
#include <linux/btf.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>

enum {
	TCP_CMSG_INQ = 1,
	TCP_CMSG_TS = 2
};

DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);

long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_memory_allocated);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_sockets_allocated);

struct tcp_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

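/* Global memory-pressure flag for TCP. It is read locklessly and set/cleared
 * with cmpxchg()/xchg() below, so updates are best-effort rather than exact.
 */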
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

void tcp_enter_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (READ_ONCE(tcp_memory_pressure))
		return;
	val = jiffies;

	if (!val)
		val--;
	if (!cmpxchg(&tcp_memory_pressure, 0, val))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

void tcp_leave_memory_pressure(struct sock *sk)
{
	unsigned long val;

	if (!READ_ONCE(tcp_memory_pressure))
		return;
	val = xchg(&tcp_memory_pressure, 0);
	if (val)
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
			      jiffies_to_msecs(jiffies - val));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);

static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}

static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}

static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
	u32 rate = READ_ONCE(tp->rate_delivered);
	u32 intv = READ_ONCE(tp->rate_interval_us);
	u64 rate64 = 0;

	if (rate && intv) {
		rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
		do_div(rate64, intv);
	}
	return rate64;
}

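/* Address-family independent initialization of a tcp_sock: timers, RTT
 * estimator defaults, the initial congestion window, buffer sizes and the
 * congestion-control module are set up here for a freshly created socket.
 */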
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	tcp_init_xmit_timers(sk);
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	icsk->icsk_rto_min = TCP_RTO_MIN;
	icsk->icsk_delack_max = TCP_DELACK_MAX;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);

	tp->app_limited = ~0U;

	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));

	sk_sockets_allocated_inc(sk);
}
EXPORT_SYMBOL(tcp_init_sock);

static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (tsflags && skb) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

		sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
			tcb->txstamp_ack = 1;
		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
	}
}

static bool tcp_stream_is_readable(struct sock *sk, int target)
{
	if (tcp_epollin_ready(sk, target))
		return true;
	return sk_is_readable(sk);
}

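/* Poll/epoll support: compute the event mask for the socket without taking
 * the socket lock; the state is sampled with inet_sk_state_load() and the
 * result may therefore be slightly stale by the time the caller sees it.
 */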
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_poll_wait(file, sock, wait);

	state = inet_sk_state_load(sk);
	if (state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	mask = 0;

	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

	if (state != TCP_SYN_SENT &&
	    (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
		int target = sock_rcvlowat(sk, 0, INT_MAX);
		u16 urg_data = READ_ONCE(tp->urg_data);

		if (unlikely(urg_data) &&
		    READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
		    !sock_flag(sk, SOCK_URGINLINE))
			target++;

		if (tcp_stream_is_readable(sk, target))
			mask |= EPOLLIN | EPOLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (__sk_stream_is_writeable(sk, 1)) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				smp_mb__after_atomic();
				if (__sk_stream_is_writeable(sk, 1))
					mask |= EPOLLOUT | EPOLLWRNORM;
			}
		} else
			mask |= EPOLLOUT | EPOLLWRNORM;

		if (urg_data & TCP_URG_VALID)
			mask |= EPOLLPRI;
	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
		mask |= EPOLLOUT | EPOLLWRNORM;
	}

	smp_rmb();
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR;

	return mask;
}
EXPORT_SYMBOL(tcp_poll);

int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;
	bool slow;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		slow = lock_sock_fast(sk);
		answ = tcp_inq(sk);
		unlock_sock_fast(sk, slow);
		break;
	case SIOCATMARK:
		answ = READ_ONCE(tp->urg_data) &&
		       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) - tp->snd_una;
		break;
	case SIOCOUTQNSD:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = READ_ONCE(tp->write_seq) -
			       READ_ONCE(tp->snd_nxt);
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return put_user(answ, (int __user *)arg);
}
EXPORT_SYMBOL(tcp_ioctl);

void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	tcb->seq = tcb->end_seq = tp->write_seq;
	tcb->tcp_flags = TCPHDR_ACK;
	__skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk_wmem_queued_add(sk, skb->truesize);
	sk_mem_charge(sk, skb->truesize);
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;

	tcp_slow_start_after_idle_check(sk);
}

static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
}

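/* Decide whether a small write should be held back (auto-corked) in the hope
 * that it can be merged with more data from the application: the tail skb is
 * below the size goal, autocorking is enabled, previously sent data exists
 * (the rtx queue is not empty), at least one skb has not yet been fully
 * released by lower layers, and the tail skb can still be collapsed into.
 */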
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
				int size_goal)
{
	return skb->len < size_goal &&
	       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
	       !tcp_rtx_queue_empty(sk) &&
	       refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
	       tcp_skb_can_collapse_to(skb);
}

void tcp_push(struct sock *sk, int flags, int mss_now,
	      int nonagle, int size_goal)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	skb = tcp_write_queue_tail(sk);
	if (!skb)
		return;
	if (!(flags & MSG_MORE) || forced_push(tp))
		tcp_mark_push(tp, skb);

	tcp_mark_urg(tp, flags);

	if (tcp_should_autocork(sk, skb, size_goal)) {

		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		}

		if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
			return;
	}

	if (flags & MSG_MORE)
		nonagle = TCP_NAGLE_CORK;

	__tcp_push_pending_frames(sk, mss_now, nonagle);
}

static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
				unsigned int offset, size_t len)
{
	struct tcp_splice_state *tss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
			      min(rd_desc->count, len), tss->flags);
	if (ret > 0)
		rd_desc->count -= ret;
	return ret;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
	read_descriptor_t rd_desc = {
		.arg.data = tss,
		.count = tss->len,
	};

	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

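/* tcp_splice_read - splice received data from a TCP socket into a pipe.
 * Returns the number of bytes spliced, or a negative error if nothing was
 * spliced; the socket lock is dropped and re-acquired between chunks so
 * that incoming packets can be processed while the pipe is being filled.
 */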
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags)
{
	struct sock *sk = sock->sk;
	struct tcp_splice_state tss = {
		.pipe = pipe,
		.len = len,
		.flags = flags,
	};
	long timeo;
	ssize_t spliced;
	int ret;

	sock_rps_record_flow(sk);

	if (unlikely(*ppos))
		return -ESPIPE;

	ret = spliced = 0;

	lock_sock(sk);

	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
	while (tss.len) {
		ret = __tcp_splice_read(sk, &tss);
		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (sock_flag(sk, SOCK_DONE))
				break;
			if (sk->sk_err) {
				ret = sock_error(sk);
				break;
			}
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;
			if (sk->sk_state == TCP_CLOSE) {
				ret = -ENOTCONN;
				break;
			}
			if (!timeo) {
				ret = -EAGAIN;
				break;
			}

			if (!skb_queue_empty(&sk->sk_receive_queue))
				break;
			sk_wait_data(sk, &timeo, NULL);
			if (signal_pending(current)) {
				ret = sock_intr_errno(timeo);
				break;
			}
			continue;
		}
		tss.len -= ret;
		spliced += ret;

		if (!timeo)
			break;
		release_sock(sk);
		lock_sock(sk);

		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current))
			break;
	}

	release_sock(sk);

	if (spliced)
		return spliced;

	return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
				     bool force_schedule)
{
	struct sk_buff *skb;

	if (unlikely(tcp_under_memory_pressure(sk)))
		sk_mem_reclaim_partial(sk);

	skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
	if (likely(skb)) {
		bool mem_scheduled;

		skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
		if (force_schedule) {
			mem_scheduled = true;
			sk_forced_mem_schedule(sk, skb->truesize);
		} else {
			mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
		}
		if (likely(mem_scheduled)) {
			skb_reserve(skb, MAX_TCP_HEADER);
			skb->ip_summed = CHECKSUM_PARTIAL;
			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
			return skb;
		}
		__kfree_skb(skb);
	} else {
		sk->sk_prot->enter_memory_pressure(sk);
		sk_stream_moderate_sndbuf(sk);
	}
	return NULL;
}

static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
				       int large_allowed)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 new_size_goal, size_goal;

	if (!large_allowed)
		return mss_now;

	new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);

	size_goal = tp->gso_segs * mss_now;
	if (unlikely(new_size_goal < size_goal ||
		     new_size_goal >= size_goal + mss_now)) {
		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
				     sk->sk_gso_max_segs);
		size_goal = tp->gso_segs * mss_now;
	}

	return max(size_goal, mss_now);
}

int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}

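/* Drop an skb that was queued at the tail of the write queue but ended up
 * carrying no payload (seq == end_seq), so it does not waste memory; if the
 * write queue becomes empty, the TCP_CHRONO_BUSY accounting is stopped too.
 */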
void tcp_remove_empty_skb(struct sock *sk)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);

	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
		tcp_unlink_write_queue(skb, sk);
		if (tcp_write_queue_empty(sk))
			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
		tcp_wmem_free_skb(sk, skb);
	}
}

static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
{
	if (unlikely(skb_zcopy_pure(skb))) {
		u32 extra = skb->truesize -
			    SKB_TRUESIZE(skb_end_offset(skb));

		if (!sk_wmem_schedule(sk, extra))
			return -ENOMEM;

		sk_mem_charge(sk, extra);
		skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
	}
	return 0;
}

static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
				      struct page *page, int offset, size_t *size)
{
	struct sk_buff *skb = tcp_write_queue_tail(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	bool can_coalesce;
	int copy, i;

	if (!skb || (copy = size_goal - skb->len) <= 0 ||
	    !tcp_skb_can_collapse_to(skb)) {
new_segment:
		if (!sk_stream_memory_free(sk))
			return NULL;

		skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
					   tcp_rtx_and_write_queues_empty(sk));
		if (!skb)
			return NULL;

#ifdef CONFIG_TLS_DEVICE
		skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
#endif
		tcp_skb_entail(sk, skb);
		copy = size_goal;
	}

	if (copy > *size)
		copy = *size;

	i = skb_shinfo(skb)->nr_frags;
	can_coalesce = skb_can_coalesce(skb, i, page, offset);
	if (!can_coalesce && i >= sysctl_max_skb_frags) {
		tcp_mark_push(tp, skb);
		goto new_segment;
	}
	if (tcp_downgrade_zcopy_pure(sk, skb) || !sk_wmem_schedule(sk, copy))
		return NULL;

	if (can_coalesce) {
		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
	} else {
		get_page(page);
		skb_fill_page_desc(skb, i, page, offset, copy);
	}

	if (!(flags & MSG_NO_SHARED_FRAGS))
		skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;

	skb->len += copy;
	skb->data_len += copy;
	skb->truesize += copy;
	sk_wmem_queued_add(sk, copy);
	sk_mem_charge(sk, copy);
	WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
	TCP_SKB_CB(skb)->end_seq += copy;
	tcp_skb_pcount_set(skb, 0);

	*size = copy;
	return skb;
}

ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
			 size_t size, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	if (IS_ENABLED(CONFIG_DEBUG_VM) &&
	    WARN_ONCE(!sendpage_ok(page),
		      "page must not be a Slab one and have page_count > 0"))
		return -EINVAL;

	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto out_err;
	}

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	mss_now = tcp_send_mss(sk, &size_goal, flags);
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto out_err;

	while (size > 0) {
		struct sk_buff *skb;
		size_t copy = size;

		skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
		if (!skb)
			goto wait_for_space;

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		copied += copy;
		offset += copy;
		size -= copy;
		if (!size)
			goto out;

		if (skb->len < size_goal || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_space:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		tcp_push(sk, flags & ~MSG_MORE, mss_now,
			 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied) {
		tcp_tx_timestamp(sk, sk->sk_tsflags);
		if (!(flags & MSG_SENDPAGE_NOTLAST))
			tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
	return copied;

do_error:
	tcp_remove_empty_skb(sk);
	if (copied)
		goto out;
out_err:
	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return sk_stream_error(sk, flags, err);
}
EXPORT_SYMBOL_GPL(do_tcp_sendpages);

int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
			size_t size, int flags)
{
	if (!(sk->sk_route_caps & NETIF_F_SG))
		return sock_no_sendpage_locked(sk, page, offset, size, flags);

	tcp_rate_check_app_limited(sk);

	return do_tcp_sendpages(sk, page, offset, size, flags);
}
EXPORT_SYMBOL_GPL(tcp_sendpage_locked);

int tcp_sendpage(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendpage_locked(sk, page, offset, size, flags);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sendpage);

void tcp_free_fastopen_req(struct tcp_sock *tp)
{
	if (tp->fastopen_req) {
		kfree(tp->fastopen_req);
		tp->fastopen_req = NULL;
	}
}

static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
				int *copied, size_t size,
				struct ubuf_info *uarg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr *uaddr = msg->msg_name;
	int err, flags;

	if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
	      TFO_CLIENT_ENABLE) ||
	    (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
	     uaddr->sa_family == AF_UNSPEC))
		return -EOPNOTSUPP;
	if (tp->fastopen_req)
		return -EALREADY;

	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
				   sk->sk_allocation);
	if (unlikely(!tp->fastopen_req))
		return -ENOBUFS;
	tp->fastopen_req->data = msg;
	tp->fastopen_req->size = size;
	tp->fastopen_req->uarg = uarg;

	if (inet->defer_connect) {
		err = tcp_connect(sk);

		if (err) {
			tcp_set_state(sk, TCP_CLOSE);
			inet->inet_dport = 0;
			sk->sk_route_caps = 0;
		}
	}
	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
	err = __inet_stream_connect(sk->sk_socket, uaddr,
				    msg->msg_namelen, flags, 1);

	if (tp->fastopen_req) {
		*copied = tp->fastopen_req->copied;
		tcp_free_fastopen_req(tp);
		inet->defer_connect = 0;
	}
	return err;
}

int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct ubuf_info *uarg = NULL;
	struct sk_buff *skb;
	struct sockcm_cookie sockc;
	int flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0;
	int process_backlog = 0;
	bool zc = false;
	long timeo;

	flags = msg->msg_flags;

	if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
		skb = tcp_write_queue_tail(sk);
		uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
		if (!uarg) {
			err = -ENOBUFS;
			goto out_err;
		}

		zc = sk->sk_route_caps & NETIF_F_SG;
		if (!zc)
			uarg->zerocopy = 0;
	}

	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
	    !tp->repair) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
	}

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	tcp_rate_check_app_limited(sk);

	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto do_error;
	}

	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;
	}

	sockcm_init(&sockc, sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err)) {
			err = -EINVAL;
			goto out_err;
		}
	}

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	copied = 0;

restart:
	mss_now = tcp_send_mss(sk, &size_goal, flags);

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	while (msg_data_left(msg)) {
		int copy = 0;

		skb = tcp_write_queue_tail(sk);
		if (skb)
			copy = size_goal - skb->len;

		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
			bool first_skb;

new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_space;

			if (unlikely(process_backlog >= 16)) {
				process_backlog = 0;
				if (sk_flush_backlog(sk))
					goto restart;
			}
			first_skb = tcp_rtx_and_write_queues_empty(sk);
			skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
						   first_skb);
			if (!skb)
				goto wait_for_space;

			process_backlog++;

			tcp_skb_entail(sk, skb);
			copy = size_goal;

			if (tp->repair)
				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
		}

		if (copy > msg_data_left(msg))
			copy = msg_data_left(msg);

		if (!zc) {
			bool merge = true;
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			if (!sk_page_frag_refill(sk, pfrag))
				goto wait_for_space;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				if (i >= sysctl_max_skb_frags) {
					tcp_mark_push(tp, skb);
					goto new_segment;
				}
				merge = false;
			}

			copy = min_t(int, copy, pfrag->size - pfrag->offset);

			if (tcp_downgrade_zcopy_pure(sk, skb) ||
			    !sk_wmem_schedule(sk, copy))
				goto wait_for_space;

			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
						       pfrag->page,
						       pfrag->offset,
						       copy);
			if (err)
				goto do_error;

			if (merge) {
				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			} else {
				skb_fill_page_desc(skb, i, pfrag->page,
						   pfrag->offset, copy);
				page_ref_inc(pfrag->page);
			}
			pfrag->offset += copy;
		} else {

			if (!skb->len)
				skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;

			if (!skb_zcopy_pure(skb)) {
				if (!sk_wmem_schedule(sk, copy))
					goto wait_for_space;
			}

			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
			if (err == -EMSGSIZE || err == -EEXIST) {
				tcp_mark_push(tp, skb);
				goto new_segment;
			}
			if (err < 0)
				goto do_error;
			copy = err;
		}

		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
		TCP_SKB_CB(skb)->end_seq += copy;
		tcp_skb_pcount_set(skb, 0);

		copied += copy;
		if (!msg_data_left(msg)) {
			if (unlikely(flags & MSG_EOR))
				TCP_SKB_CB(skb)->eor = 1;
			goto out;
		}

		if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

wait_for_space:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		if (copied)
			tcp_push(sk, flags & ~MSG_MORE, mss_now,
				 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
	if (copied) {
		tcp_tx_timestamp(sk, sockc.tsflags);
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
out_nopush:
	net_zcopy_put(uarg);
	return copied + copied_syn;

do_error:
	tcp_remove_empty_skb(sk);

	if (copied + copied_syn)
		goto out;
out_err:
	net_zcopy_put_abort(uarg, true);
	err = sk_stream_error(sk, flags, err);

	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);

int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sendmsg_locked(sk, msg, size);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sendmsg);

static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			WRITE_ONCE(tp->urg_data, TCP_URG_READ);

		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_to_msg(msg, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	return -EAGAIN;
}

static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
	struct sk_buff *skb;
	int copied = 0, err = 0;

	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			return err;
		copied += skb->len;
	}

	skb_queue_walk(&sk->sk_write_queue, skb) {
		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
		if (err)
			break;

		copied += skb->len;
	}

	return err ?: copied;
}

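/* Called after data has been copied to user space: decide whether an ACK
 * should be sent right away, either because enough data has been consumed
 * since the last ACK or because the receive window can now be opened
 * significantly, instead of waiting for the delayed-ACK timer.
 */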
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	bool time_to_ack = false;

	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);

		if (tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    (copied > 0 &&
		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		       !inet_csk_in_pingpong_mode(sk))) &&
		     !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = true;
	}

	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = true;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}

static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
{
	__skb_unlink(skb, &sk->sk_receive_queue);
	if (likely(skb->destructor == sock_rfree)) {
		sock_rfree(skb);
		skb->destructor = NULL;
		skb->sk = NULL;
		return skb_attempt_defer_free(skb);
	}
	__kfree_skb(skb);
}

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			pr_err_once("%s: found a SYN, please report !\n", __func__);
			offset--;
		}
		if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
			*off = offset;
			return skb;
		}

		tcp_eat_recv_skb(sk, skb);
	}
	return NULL;
}

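/* Feed received data directly to a callback (recv_actor) without copying it
 * into a user buffer first; used by splice() via __tcp_splice_read() and
 * other in-kernel consumers. The caller must hold the socket lock. Returns
 * the number of bytes handed to the callback, or the callback's error code
 * if nothing was consumed.
 */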
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			int used;
			size_t len;

			len = skb->len - offset;

			if (unlikely(tp->urg_data)) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= 0) {
				if (!copied)
					copied = used;
				break;
			}
			if (WARN_ON_ONCE(used > len))
				used = len;
			seq += used;
			copied += used;
			offset += used;

			skb = tcp_recv_skb(sk, seq - 1, &offset);
			if (!skb)
				break;

			if (offset + 1 != skb->len)
				continue;
		}
		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
			tcp_eat_recv_skb(sk, skb);
			++seq;
			break;
		}
		tcp_eat_recv_skb(sk, skb);
		if (!desc->count)
			break;
		WRITE_ONCE(tp->copied_seq, seq);
	}
	WRITE_ONCE(tp->copied_seq, seq);

	tcp_rcv_space_adjust(sk);

	if (copied > 0) {
		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, copied);
	}
	return copied;
}
EXPORT_SYMBOL(tcp_read_sock);

int tcp_peek_len(struct socket *sock)
{
	return tcp_inq(sock->sk);
}
EXPORT_SYMBOL(tcp_peek_len);

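/* Set SO_RCVLOWAT for a TCP socket: clamp the value against the receive
 * buffer limits, wake up readers that may already have enough data queued,
 * and, unless the application locked the buffer size, grow sk_rcvbuf and
 * the window clamp so the requested low-water mark can actually be reached.
 */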
int tcp_set_rcvlowat(struct sock *sk, int val)
{
	int cap;

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		cap = sk->sk_rcvbuf >> 1;
	else
		cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
	val = min(val, cap);
	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);

	tcp_data_ready(sk);

	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
		return 0;

	val <<= 1;
	if (val > sk->sk_rcvbuf) {
		WRITE_ONCE(sk->sk_rcvbuf, val);
		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
	}
	return 0;
}
EXPORT_SYMBOL(tcp_set_rcvlowat);

void tcp_update_recv_tstamps(struct sk_buff *skb,
			     struct scm_timestamping_internal *tss)
{
	if (skb->tstamp)
		tss->ts[0] = ktime_to_timespec64(skb->tstamp);
	else
		tss->ts[0] = (struct timespec64) {0};

	if (skb_hwtstamps(skb)->hwtstamp)
		tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
	else
		tss->ts[2] = (struct timespec64) {0};
}

#ifdef CONFIG_MMU
static const struct vm_operations_struct tcp_vm_ops = {
};

int tcp_mmap(struct file *file, struct socket *sock,
	     struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
		return -EPERM;
	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);

	vma->vm_flags |= VM_MIXEDMAP;

	vma->vm_ops = &tcp_vm_ops;
	return 0;
}
EXPORT_SYMBOL(tcp_mmap);

static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
				       u32 *offset_frag)
{
	skb_frag_t *frag;

	if (unlikely(offset_skb >= skb->len))
		return NULL;

	offset_skb -= skb_headlen(skb);
	if ((int)offset_skb < 0 || skb_has_frag_list(skb))
		return NULL;

	frag = skb_shinfo(skb)->frags;
	while (offset_skb) {
		if (skb_frag_size(frag) > offset_skb) {
			*offset_frag = offset_skb;
			return frag;
		}
		offset_skb -= skb_frag_size(frag);
		++frag;
	}
	*offset_frag = 0;
	return frag;
}

static bool can_map_frag(const skb_frag_t *frag)
{
	return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
}

static int find_next_mappable_frag(const skb_frag_t *frag,
				   int remaining_in_skb)
{
	int offset = 0;

	if (likely(can_map_frag(frag)))
		return 0;

	while (offset < remaining_in_skb && !can_map_frag(frag)) {
		offset += skb_frag_size(frag);
		++frag;
	}
	return offset;
}

static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
					  struct tcp_zerocopy_receive *zc,
					  struct sk_buff *skb, u32 offset)
{
	u32 frag_offset, partial_frag_remainder = 0;
	int mappable_offset;
	skb_frag_t *frag;

	zc->recv_skip_hint = skb->len - offset;

	frag = skb_advance_to_frag(skb, offset, &frag_offset);
	if (!frag)
		return;

	if (frag_offset) {
		struct skb_shared_info *info = skb_shinfo(skb);

		if (frag == &info->frags[info->nr_frags - 1])
			return;

		partial_frag_remainder = skb_frag_size(frag) - frag_offset;
		zc->recv_skip_hint -= partial_frag_remainder;
		++frag;
	}

	mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
	zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
}

static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
			      int flags, struct scm_timestamping_internal *tss,
			      int *cmsg_flags);
static int receive_fallback_to_copy(struct sock *sk,
				    struct tcp_zerocopy_receive *zc, int inq,
				    struct scm_timestamping_internal *tss)
{
	unsigned long copy_address = (unsigned long)zc->copybuf_address;
	struct msghdr msg = {};
	struct iovec iov;
	int err;

	zc->length = 0;
	zc->recv_skip_hint = 0;

	if (copy_address != zc->copybuf_address)
		return -EINVAL;

	err = import_single_range(READ, (void __user *)copy_address,
				  inq, &iov, &msg.msg_iter);
	if (err)
		return err;

	err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT,
				 tss, &zc->msg_flags);
	if (err < 0)
		return err;

	zc->copybuf_len = err;
	if (likely(zc->copybuf_len)) {
		struct sk_buff *skb;
		u32 offset;

		skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
		if (skb)
			tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
	}
	return 0;
}

static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
				   struct sk_buff *skb, u32 copylen,
				   u32 *offset, u32 *seq)
{
	unsigned long copy_address = (unsigned long)zc->copybuf_address;
	struct msghdr msg = {};
	struct iovec iov;
	int err;

	if (copy_address != zc->copybuf_address)
		return -EINVAL;

	err = import_single_range(READ, (void __user *)copy_address,
				  copylen, &iov, &msg.msg_iter);
	if (err)
		return err;
	err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
	if (err)
		return err;
	zc->recv_skip_hint -= copylen;
	*offset += copylen;
	*seq += copylen;
	return (__s32)copylen;
}

static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
				  struct sock *sk,
				  struct sk_buff *skb,
				  u32 *seq,
				  s32 copybuf_len,
				  struct scm_timestamping_internal *tss)
{
	u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);

	if (!copylen)
		return 0;

	if (skb) {
		offset = *seq - TCP_SKB_CB(skb)->seq;
	} else {
		skb = tcp_recv_skb(sk, *seq, &offset);
		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			tcp_update_recv_tstamps(skb, tss);
			zc->msg_flags |= TCP_CMSG_TS;
		}
	}

	zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
						  seq);
	return zc->copybuf_len < 0 ? 0 : copylen;
}

static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
					      struct page **pending_pages,
					      unsigned long pages_remaining,
					      unsigned long *address,
					      u32 *length,
					      u32 *seq,
					      struct tcp_zerocopy_receive *zc,
					      u32 total_bytes_to_map,
					      int err)
{
	if (err == -EBUSY &&
	    zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
		u32 maybe_zap_len;

		maybe_zap_len = total_bytes_to_map -
				*length +
				(pages_remaining * PAGE_SIZE);
		zap_page_range(vma, *address, maybe_zap_len);
		err = 0;
	}

	if (!err) {
		unsigned long leftover_pages = pages_remaining;
		int bytes_mapped;

		err = vm_insert_pages(vma, *address,
				      pending_pages,
				      &pages_remaining);
		bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
		*seq += bytes_mapped;
		*address += bytes_mapped;
	}
	if (err) {
		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;

		*length -= bytes_not_mapped;
		zc->recv_skip_hint += bytes_not_mapped;
	}
	return err;
}

static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
					struct page **pages,
					unsigned int pages_to_map,
					unsigned long *address,
					u32 *length,
					u32 *seq,
					struct tcp_zerocopy_receive *zc,
					u32 total_bytes_to_map)
{
	unsigned long pages_remaining = pages_to_map;
	unsigned int pages_mapped;
	unsigned int bytes_mapped;
	int err;

	err = vm_insert_pages(vma, *address, pages, &pages_remaining);
	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
	bytes_mapped = PAGE_SIZE * pages_mapped;

	*seq += bytes_mapped;
	*address += bytes_mapped;

	if (likely(!err))
		return 0;

	return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
		pages_remaining, address, length, seq, zc, total_bytes_to_map,
		err);
}

#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS)
static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
				      struct tcp_zerocopy_receive *zc,
				      struct scm_timestamping_internal *tss)
{
	unsigned long msg_control_addr;
	struct msghdr cmsg_dummy;

	msg_control_addr = (unsigned long)zc->msg_control;
	cmsg_dummy.msg_control = (void *)msg_control_addr;
	cmsg_dummy.msg_controllen =
		(__kernel_size_t)zc->msg_controllen;
	cmsg_dummy.msg_flags = in_compat_syscall()
		? MSG_CMSG_COMPAT : 0;
	cmsg_dummy.msg_control_is_user = true;
	zc->msg_flags = 0;
	if (zc->msg_control == msg_control_addr &&
	    zc->msg_controllen == cmsg_dummy.msg_controllen) {
		tcp_recv_timestamp(&cmsg_dummy, sk, tss);
		zc->msg_control = (__u64)
			((uintptr_t)cmsg_dummy.msg_control);
		zc->msg_controllen =
			(__u64)cmsg_dummy.msg_controllen;
		zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
	}
}

#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
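/* Receive-side zerocopy (TCP_ZEROCOPY_RECEIVE): map full, page-aligned frags
 * of queued skbs directly into the caller's VMA in batches, fall back to
 * copying into zc->copybuf for small amounts or trailing partial pages, and
 * advance copied_seq for everything handed to user space.
 */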
static int tcp_zerocopy_receive(struct sock *sk,
				struct tcp_zerocopy_receive *zc,
				struct scm_timestamping_internal *tss)
{
	u32 length = 0, offset, vma_len, avail_len, copylen = 0;
	unsigned long address = (unsigned long)zc->address;
	struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
	s32 copybuf_len = zc->copybuf_len;
	struct tcp_sock *tp = tcp_sk(sk);
	const skb_frag_t *frags = NULL;
	unsigned int pages_to_map = 0;
	struct vm_area_struct *vma;
	struct sk_buff *skb = NULL;
	u32 seq = tp->copied_seq;
	u32 total_bytes_to_map;
	int inq = tcp_inq(sk);
	int ret;

	zc->copybuf_len = 0;
	zc->msg_flags = 0;

	if (address & (PAGE_SIZE - 1) || address != zc->address)
		return -EINVAL;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;

	sock_rps_record_flow(sk);

	if (inq && inq <= copybuf_len)
		return receive_fallback_to_copy(sk, zc, inq, tss);

	if (inq < PAGE_SIZE) {
		zc->length = 0;
		zc->recv_skip_hint = inq;
		if (!inq && sock_flag(sk, SOCK_DONE))
			return -EIO;
		return 0;
	}

	mmap_read_lock(current->mm);

	vma = vma_lookup(current->mm, address);
	if (!vma || vma->vm_ops != &tcp_vm_ops) {
		mmap_read_unlock(current->mm);
		return -EINVAL;
	}
	vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
	avail_len = min_t(u32, vma_len, inq);
	total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
	if (total_bytes_to_map) {
		if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
			zap_page_range(vma, address, total_bytes_to_map);
		zc->length = total_bytes_to_map;
		zc->recv_skip_hint = 0;
	} else {
		zc->length = avail_len;
		zc->recv_skip_hint = avail_len;
	}
	ret = 0;
	while (length + PAGE_SIZE <= zc->length) {
		int mappable_offset;
		struct page *page;

		if (zc->recv_skip_hint < PAGE_SIZE) {
			u32 offset_frag;

			if (skb) {
				if (zc->recv_skip_hint > 0)
					break;
				skb = skb->next;
				offset = seq - TCP_SKB_CB(skb)->seq;
			} else {
				skb = tcp_recv_skb(sk, seq, &offset);
			}

			if (TCP_SKB_CB(skb)->has_rxtstamp) {
				tcp_update_recv_tstamps(skb, tss);
				zc->msg_flags |= TCP_CMSG_TS;
			}
			zc->recv_skip_hint = skb->len - offset;
			frags = skb_advance_to_frag(skb, offset, &offset_frag);
			if (!frags || offset_frag)
				break;
		}

		mappable_offset = find_next_mappable_frag(frags,
							  zc->recv_skip_hint);
		if (mappable_offset) {
			zc->recv_skip_hint = mappable_offset;
			break;
		}
		page = skb_frag_page(frags);
		prefetchw(page);
		pages[pages_to_map++] = page;
		length += PAGE_SIZE;
		zc->recv_skip_hint -= PAGE_SIZE;
		frags++;
		if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
		    zc->recv_skip_hint < PAGE_SIZE) {
			ret = tcp_zerocopy_vm_insert_batch(vma, pages,
							   pages_to_map,
							   &address, &length,
							   &seq, zc,
							   total_bytes_to_map);
			if (ret)
				goto out;
			pages_to_map = 0;
		}
	}
	if (pages_to_map) {
		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
						   &address, &length, &seq,
						   zc, total_bytes_to_map);
	}
out:
	mmap_read_unlock(current->mm);

	if (!ret)
		copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);

	if (length + copylen) {
		WRITE_ONCE(tp->copied_seq, seq);
		tcp_rcv_space_adjust(sk);

		tcp_recv_skb(sk, seq, &offset);
		tcp_cleanup_rbuf(sk, length + copylen);
		ret = 0;
		if (length == zc->length)
			zc->recv_skip_hint = 0;
	} else {
		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
			ret = -EIO;
	}
	zc->length = length;
	return ret;
}
#endif

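/* Emit receive timestamps as control messages. The cmsg layout depends on
 * whether the socket uses the new (64-bit time) or old timestamp ABI and on
 * which SO_TIMESTAMP* / SOF_TIMESTAMPING_* options are enabled.
 */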
void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
			struct scm_timestamping_internal *tss)
{
	int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
	bool has_timestamping = false;

	if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
		if (sock_flag(sk, SOCK_RCVTSTAMP)) {
			if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
				if (new_tstamp) {
					struct __kernel_timespec kts = {
						.tv_sec = tss->ts[0].tv_sec,
						.tv_nsec = tss->ts[0].tv_nsec,
					};
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
						 sizeof(kts), &kts);
				} else {
					struct __kernel_old_timespec ts_old = {
						.tv_sec = tss->ts[0].tv_sec,
						.tv_nsec = tss->ts[0].tv_nsec,
					};
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
						 sizeof(ts_old), &ts_old);
				}
			} else {
				if (new_tstamp) {
					struct __kernel_sock_timeval stv = {
						.tv_sec = tss->ts[0].tv_sec,
						.tv_usec = tss->ts[0].tv_nsec / 1000,
					};
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
						 sizeof(stv), &stv);
				} else {
					struct __kernel_old_timeval tv = {
						.tv_sec = tss->ts[0].tv_sec,
						.tv_usec = tss->ts[0].tv_nsec / 1000,
					};
					put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
						 sizeof(tv), &tv);
				}
			}
		}

		if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
			has_timestamping = true;
		else
			tss->ts[0] = (struct timespec64) {0};
	}

	if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
		if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
			has_timestamping = true;
		else
			tss->ts[2] = (struct timespec64) {0};
	}

	if (has_timestamping) {
		tss->ts[1] = (struct timespec64) {0};
		if (sock_flag(sk, SOCK_TSTAMP_NEW))
			put_cmsg_scm_timestamping64(msg, tss);
		else
			put_cmsg_scm_timestamping(msg, tss);
	}
}

static int tcp_inq_hint(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 copied_seq = READ_ONCE(tp->copied_seq);
	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
	int inq;

	inq = rcv_nxt - copied_seq;
	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
		lock_sock(sk);
		inq = tp->rcv_nxt - tp->copied_seq;
		release_sock(sk);
	}

	if (inq == 0 && sock_flag(sk, SOCK_DONE))
		inq = 1;
	return inq;
}

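/* Core receive path, called with the socket lock held: copy data from the
 * receive queue into the user's msghdr, honouring MSG_PEEK, MSG_TRUNC,
 * urgent data and SO_RCVLOWAT, and blocking (subject to the socket timeout)
 * until at least the target amount of data is available.
 */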
static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
			      int flags, struct scm_timestamping_internal *tss,
			      int *cmsg_flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err;
	int target;
	long timeo;
	struct sk_buff *skb, *last;
	u32 urg_hole = 0;

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	if (tp->recvmsg_inq) {
		*cmsg_flags = TCP_CMSG_INQ;
		msg->msg_get_inq = 1;
	}
	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	if (flags & MSG_OOB)
		goto recv_urg;

	if (unlikely(tp->repair)) {
		err = -EPERM;
		if (!(flags & MSG_PEEK))
			goto out;

		if (tp->repair_queue == TCP_SEND_QUEUE)
			goto recv_sndq;

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out;
	}

	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	do {
		u32 offset;

		if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		last = skb_peek_tail(&sk->sk_receive_queue);
		skb_queue_walk(&sk->sk_receive_queue, skb) {
			last = skb;

			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
				 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
				 flags))
				break;

			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
				pr_err_once("%s: found a SYN, please report !\n", __func__);
				offset--;
			}
			if (offset < skb->len)
				goto found_ok_skb;
			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
				goto found_fin_ok;
			WARN(!(flags & MSG_PEEK),
			     "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
		}

		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
			break;

		if (copied) {
			if (!timeo ||
			    sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    signal_pending(current))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				copied = -ENOTCONN;
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		if (copied >= target) {
			__sk_flush_backlog(sk);
		} else {
			tcp_cleanup_rbuf(sk, copied);
			sk_wait_data(sk, &timeo, last);
		}

		if ((flags & MSG_PEEK) &&
		    (peek_seq - copied - urg_hole != tp->copied_seq)) {
			net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
					    current->comm,
					    task_pid_nr(current));
			peek_seq = tp->copied_seq;
		}
		continue;

found_ok_skb:
		used = skb->len - offset;
		if (len < used)
			used = len;

		if (unlikely(tp->urg_data)) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						WRITE_ONCE(*seq, *seq + 1);
						urg_hole++;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
			err = skb_copy_datagram_msg(skb, offset, msg, used);
			if (err) {
				if (!copied)
					copied = -EFAULT;
				break;
			}
		}

		WRITE_ONCE(*seq, *seq + used);
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
			WRITE_ONCE(tp->urg_data, 0);
			tcp_fast_path_check(sk);
		}

		if (TCP_SKB_CB(skb)->has_rxtstamp) {
			tcp_update_recv_tstamps(skb, tss);
			*cmsg_flags |= TCP_CMSG_TS;
		}

		if (used + offset < skb->len)
			continue;

		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK))
			tcp_eat_recv_skb(sk, skb);
		continue;

found_fin_ok:
		WRITE_ONCE(*seq, *seq + 1);
		if (!(flags & MSG_PEEK))
			tcp_eat_recv_skb(sk, skb);
		break;
	} while (len > 0);

	tcp_cleanup_rbuf(sk, copied);
	return copied;

out:
	return err;

recv_urg:
	err = tcp_recv_urg(sk, msg, len, flags);
	goto out;

recv_sndq:
	err = tcp_peek_sndq(sk, msg, len);
	goto out;
}

int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
		int *addr_len)
{
	int cmsg_flags = 0, ret;
	struct scm_timestamping_internal tss;

	if (unlikely(flags & MSG_ERRQUEUE))
		return inet_recv_error(sk, msg, len, addr_len);

	if (sk_can_busy_loop(sk) &&
	    skb_queue_empty_lockless(&sk->sk_receive_queue) &&
	    sk->sk_state == TCP_ESTABLISHED)
		sk_busy_loop(sk, flags & MSG_DONTWAIT);

	lock_sock(sk);
	ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
	release_sock(sk);

	if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
		if (cmsg_flags & TCP_CMSG_TS)
			tcp_recv_timestamp(msg, sk, &tss);
		if (msg->msg_get_inq) {
			msg->msg_inq = tcp_inq_hint(sk);
			if (cmsg_flags & TCP_CMSG_INQ)
				put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
					 sizeof(msg->msg_inq), &msg->msg_inq);
		}
	}
	return ret;
}
EXPORT_SYMBOL(tcp_recvmsg);

2572void tcp_set_state(struct sock *sk, int state)
2573{
2574 int oldstate = sk->sk_state;
2575
2576
2577
2578
2579
2580
2581
2582
2583 BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2584 BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2585 BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2586 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2587 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2588 BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2589 BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2590 BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2591 BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2592 BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2593 BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2594 BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2595 BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606 BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
2607
2608 if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2609 tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2610
2611 switch (state) {
2612 case TCP_ESTABLISHED:
2613 if (oldstate != TCP_ESTABLISHED)
2614 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2615 break;
2616
2617 case TCP_CLOSE:
2618 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2619 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2620
2621 sk->sk_prot->unhash(sk);
2622 if (inet_csk(sk)->icsk_bind_hash &&
2623 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2624 inet_put_port(sk);
2625 fallthrough;
2626 default:
2627 if (oldstate == TCP_ESTABLISHED)
2628 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2629 }
2630
	/* Change state AFTER socket is unhashed to avoid closed
	 * socket sitting in hash tables.
	 */
2634 inet_sk_state_store(sk, state);
2635}
2636EXPORT_SYMBOL_GPL(tcp_set_state);
2637
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states; a FIN may already have been sent by shutdown(), or no FIN
 *	is needed at all.
 */
2645static const unsigned char new_state[16] = {
2646
2647 [0 /* (Invalid) */] = TCP_CLOSE,
2648 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2649 [TCP_SYN_SENT] = TCP_CLOSE,
2650 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2651 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2652 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2653 [TCP_TIME_WAIT] = TCP_CLOSE,
2654 [TCP_CLOSE] = TCP_CLOSE,
2655 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2656 [TCP_LAST_ACK] = TCP_LAST_ACK,
2657 [TCP_LISTEN] = TCP_CLOSE,
2658 [TCP_CLOSING] = TCP_CLOSING,
2659 [TCP_NEW_SYN_RECV] = TCP_CLOSE,
2660};
2661
2662static int tcp_close_state(struct sock *sk)
2663{
2664 int next = (int)new_state[sk->sk_state];
2665 int ns = next & TCP_STATE_MASK;
2666
2667 tcp_set_state(sk, ns);
2668
2669 return next & TCP_ACTION_FIN;
2670}
2671
/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 */
2677void tcp_shutdown(struct sock *sk, int how)
2678{
	/* We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */
2683 if (!(how & SEND_SHUTDOWN))
2684 return;
2685
	/* If we've already sent a FIN, or it's a closed state, skip this. */
2687 if ((1 << sk->sk_state) &
2688 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2689 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
2691 if (tcp_close_state(sk))
2692 tcp_send_fin(sk);
2693 }
2694}
2695EXPORT_SYMBOL(tcp_shutdown);
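
/* Illustrative userspace sketch (not part of this source file): a half-close
 * via shutdown(2) reaches this function with SEND_SHUTDOWN set, queueing a
 * FIN while the receive side stays usable:
 *
 *	shutdown(fd, SHUT_WR);			// we send a FIN, can still read
 *	while ((n = read(fd, buf, sizeof(buf))) > 0)
 *		handle_data(buf, n);		// drain the peer's remaining data
 *
 * handle_data() is a placeholder for application code.
 */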
2696
2697int tcp_orphan_count_sum(void)
2698{
2699 int i, total = 0;
2700
2701 for_each_possible_cpu(i)
2702 total += per_cpu(tcp_orphan_count, i);
2703
2704 return max(total, 0);
2705}
2706
2707static int tcp_orphan_cache;
2708static struct timer_list tcp_orphan_timer;
2709#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
2710
2711static void tcp_orphan_update(struct timer_list *unused)
2712{
2713 WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
2714 mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
2715}
2716
2717static bool tcp_too_many_orphans(int shift)
2718{
2719 return READ_ONCE(tcp_orphan_cache) << shift >
2720 READ_ONCE(sysctl_tcp_max_orphans);
2721}
2722
2723bool tcp_check_oom(struct sock *sk, int shift)
2724{
2725 bool too_many_orphans, out_of_socket_memory;
2726
2727 too_many_orphans = tcp_too_many_orphans(shift);
2728 out_of_socket_memory = tcp_out_of_memory(sk);
2729
2730 if (too_many_orphans)
2731 net_info_ratelimited("too many orphaned sockets\n");
2732 if (out_of_socket_memory)
2733 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2734 return too_many_orphans || out_of_socket_memory;
2735}
2736
2737void __tcp_close(struct sock *sk, long timeout)
2738{
2739 struct sk_buff *skb;
2740 int data_was_unread = 0;
2741 int state;
2742
2743 sk->sk_shutdown = SHUTDOWN_MASK;
2744
2745 if (sk->sk_state == TCP_LISTEN) {
2746 tcp_set_state(sk, TCP_CLOSE);
2747
2748
2749 inet_csk_listen_stop(sk);
2750
2751 goto adjudge_to_death;
2752 }
2753
	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
2758 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2759 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2760
2761 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2762 len--;
2763 data_was_unread += len;
2764 __kfree_skb(skb);
2765 }
2766
2767 sk_mem_reclaim(sk);
2768
	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
2770 if (sk->sk_state == TCP_CLOSE)
2771 goto adjudge_to_death;
2772
	/* As outlined in RFC 2525, section 2.17, we send a RST here because
	 * data was lost: the user closed the socket while unread data was
	 * still sitting in the receive queue.
	 */
2780 if (unlikely(tcp_sk(sk)->repair)) {
2781 sk->sk_prot->disconnect(sk, 0);
2782 } else if (data_was_unread) {
2783
2784 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2785 tcp_set_state(sk, TCP_CLOSE);
2786 tcp_send_active_reset(sk, sk->sk_allocation);
2787 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2788
2789 sk->sk_prot->disconnect(sk, 0);
2790 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2791 } else if (tcp_close_state(sk)) {
		/* We FIN if the user asked for it.
		 * tcp_close_state() has already moved us to the appropriate
		 * state (e.g. FIN_WAIT1 or LAST_ACK); all that remains on
		 * this path is queueing the FIN itself.
		 */
2821 tcp_send_fin(sk);
2822 }
2823
2824 sk_stream_wait_close(sk, timeout);
2825
2826adjudge_to_death:
2827 state = sk->sk_state;
2828 sock_hold(sk);
2829 sock_orphan(sk);
2830
2831 local_bh_disable();
2832 bh_lock_sock(sk);
	/* remove backlog if any, without releasing ownership. */
2834 __release_sock(sk);
2835
2836 this_cpu_inc(tcp_orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
2839 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2840 goto out;
2841
	/* The peer could keep its end open forever with no application left
	 * on our side, so do not stay in FIN_WAIT2 indefinitely: either arm
	 * the keepalive timer for the remainder of the FIN timeout or move
	 * straight to a timewait socket, as governed by linger2 /
	 * tcp_fin_timeout.
	 */
2856 if (sk->sk_state == TCP_FIN_WAIT2) {
2857 struct tcp_sock *tp = tcp_sk(sk);
2858 if (tp->linger2 < 0) {
2859 tcp_set_state(sk, TCP_CLOSE);
2860 tcp_send_active_reset(sk, GFP_ATOMIC);
2861 __NET_INC_STATS(sock_net(sk),
2862 LINUX_MIB_TCPABORTONLINGER);
2863 } else {
2864 const int tmo = tcp_fin_time(sk);
2865
2866 if (tmo > TCP_TIMEWAIT_LEN) {
2867 inet_csk_reset_keepalive_timer(sk,
2868 tmo - TCP_TIMEWAIT_LEN);
2869 } else {
2870 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2871 goto out;
2872 }
2873 }
2874 }
2875 if (sk->sk_state != TCP_CLOSE) {
2876 sk_mem_reclaim(sk);
2877 if (tcp_check_oom(sk, 0)) {
2878 tcp_set_state(sk, TCP_CLOSE);
2879 tcp_send_active_reset(sk, GFP_ATOMIC);
2880 __NET_INC_STATS(sock_net(sk),
2881 LINUX_MIB_TCPABORTONMEMORY);
2882 } else if (!check_net(sock_net(sk))) {
			/* Not possible to send reset; just close */
2884 tcp_set_state(sk, TCP_CLOSE);
2885 }
2886 }
2887
2888 if (sk->sk_state == TCP_CLOSE) {
2889 struct request_sock *req;
2890
2891 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2892 lockdep_sock_is_held(sk));
		/* We could get here with a non-NULL req if the socket is
		 * aborted (e.g., closed with unread data) before the 3WHS
		 * finishes.
		 */
2897 if (req)
2898 reqsk_fastopen_remove(sk, req, false);
2899 inet_csk_destroy_sock(sk);
2900 }
2901
	/* Otherwise, socket is reprieved until protocol close. */
2903out:
2904 bh_unlock_sock(sk);
2905 local_bh_enable();
2906}
2907
2908void tcp_close(struct sock *sk, long timeout)
2909{
2910 lock_sock(sk);
2911 __tcp_close(sk, timeout);
2912 release_sock(sk);
2913 sock_put(sk);
2914}
2915EXPORT_SYMBOL(tcp_close);
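
/* Illustrative userspace sketch (not part of this source file): the
 * SOCK_LINGER && !sk_lingertime branch in __tcp_close() is the "hard close"
 * case, i.e. SO_LINGER with a zero timeout, which aborts the connection
 * (RST) instead of performing the FIN handshake:
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *	close(fd);	// disconnects immediately, counted as TCPABORTONDATA
 */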
2916

/* These states need RST on ABORT according to RFC793 */
2919static inline bool tcp_need_reset(int state)
2920{
2921 return (1 << state) &
2922 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2923 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2924}
2925
2926static void tcp_rtx_queue_purge(struct sock *sk)
2927{
2928 struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2929
2930 tcp_sk(sk)->highest_sack = NULL;
2931 while (p) {
2932 struct sk_buff *skb = rb_to_skb(p);
2933
2934 p = rb_next(p);
2935
		/* Since we are deleting whole queue, no need to
		 * list_del(&skb->tcp_tsorted_anchor)
		 */
2938 tcp_rtx_queue_unlink(skb, sk);
2939 tcp_wmem_free_skb(sk, skb);
2940 }
2941}
2942
2943void tcp_write_queue_purge(struct sock *sk)
2944{
2945 struct sk_buff *skb;
2946
2947 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
2948 while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
2949 tcp_skb_tsorted_anchor_cleanup(skb);
2950 tcp_wmem_free_skb(sk, skb);
2951 }
2952 tcp_rtx_queue_purge(sk);
2953 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2954 sk_mem_reclaim(sk);
2955 tcp_clear_all_retrans_hints(tcp_sk(sk));
2956 tcp_sk(sk)->packets_out = 0;
2957 inet_csk(sk)->icsk_backoff = 0;
2958}
2959
2960int tcp_disconnect(struct sock *sk, int flags)
2961{
2962 struct inet_sock *inet = inet_sk(sk);
2963 struct inet_connection_sock *icsk = inet_csk(sk);
2964 struct tcp_sock *tp = tcp_sk(sk);
2965 int old_state = sk->sk_state;
2966 u32 seq;
2967
2968 if (old_state != TCP_CLOSE)
2969 tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
2972 if (old_state == TCP_LISTEN) {
2973 inet_csk_listen_stop(sk);
2974 } else if (unlikely(tp->repair)) {
2975 sk->sk_err = ECONNABORTED;
2976 } else if (tcp_need_reset(old_state) ||
2977 (tp->snd_nxt != tp->write_seq &&
2978 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepancy of Linux wrt. RFC
		 * states.
		 */
2982 tcp_send_active_reset(sk, gfp_any());
2983 sk->sk_err = ECONNRESET;
2984 } else if (old_state == TCP_SYN_SENT)
2985 sk->sk_err = ECONNRESET;
2986
2987 tcp_clear_xmit_timers(sk);
2988 __skb_queue_purge(&sk->sk_receive_queue);
2989 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
2990 WRITE_ONCE(tp->urg_data, 0);
2991 tcp_write_queue_purge(sk);
2992 tcp_fastopen_active_disable_ofo_check(sk);
2993 skb_rbtree_purge(&tp->out_of_order_queue);
2994
2995 inet->inet_dport = 0;
2996
2997 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2998 inet_reset_saddr(sk);
2999
3000 sk->sk_shutdown = 0;
3001 sock_reset_flag(sk, SOCK_DONE);
3002 tp->srtt_us = 0;
3003 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
3004 tp->rcv_rtt_last_tsecr = 0;
3005
3006 seq = tp->write_seq + tp->max_window + 2;
3007 if (!seq)
3008 seq = 1;
3009 WRITE_ONCE(tp->write_seq, seq);
3010
3011 icsk->icsk_backoff = 0;
3012 icsk->icsk_probes_out = 0;
3013 icsk->icsk_probes_tstamp = 0;
3014 icsk->icsk_rto = TCP_TIMEOUT_INIT;
3015 icsk->icsk_rto_min = TCP_RTO_MIN;
3016 icsk->icsk_delack_max = TCP_DELACK_MAX;
3017 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
3018 tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
3019 tp->snd_cwnd_cnt = 0;
3020 tp->window_clamp = 0;
3021 tp->delivered = 0;
3022 tp->delivered_ce = 0;
3023 if (icsk->icsk_ca_ops->release)
3024 icsk->icsk_ca_ops->release(sk);
3025 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
3026 icsk->icsk_ca_initialized = 0;
3027 tcp_set_ca_state(sk, TCP_CA_Open);
3028 tp->is_sack_reneg = 0;
3029 tcp_clear_retrans(tp);
3030 tp->total_retrans = 0;
3031 inet_csk_delack_init(sk);
3032
	/* Initialize rcv_mss to TCP_MIN_MSS to avoid a divide-by-zero
	 * issue in __tcp_select_window().
	 */
3035 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
3036 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
3037 __sk_dst_reset(sk);
3038 dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
3039 tcp_saved_syn_free(tp);
3040 tp->compressed_ack = 0;
3041 tp->segs_in = 0;
3042 tp->segs_out = 0;
3043 tp->bytes_sent = 0;
3044 tp->bytes_acked = 0;
3045 tp->bytes_received = 0;
3046 tp->bytes_retrans = 0;
3047 tp->data_segs_in = 0;
3048 tp->data_segs_out = 0;
3049 tp->duplicate_sack[0].start_seq = 0;
3050 tp->duplicate_sack[0].end_seq = 0;
3051 tp->dsack_dups = 0;
3052 tp->reord_seen = 0;
3053 tp->retrans_out = 0;
3054 tp->sacked_out = 0;
3055 tp->tlp_high_seq = 0;
3056 tp->last_oow_ack_time = 0;
3057
3058 tp->app_limited = ~0U;
3059 tp->rack.mstamp = 0;
3060 tp->rack.advanced = 0;
3061 tp->rack.reo_wnd_steps = 1;
3062 tp->rack.last_delivered = 0;
3063 tp->rack.reo_wnd_persist = 0;
3064 tp->rack.dsack_seen = 0;
3065 tp->syn_data_acked = 0;
3066 tp->rx_opt.saw_tstamp = 0;
3067 tp->rx_opt.dsack = 0;
3068 tp->rx_opt.num_sacks = 0;
3069 tp->rcv_ooopack = 0;
3070

	/* Clean up fastopen related fields */
3073 tcp_free_fastopen_req(tp);
3074 inet->defer_connect = 0;
3075 tp->fastopen_client_fail = 0;
3076
3077 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
3078
3079 if (sk->sk_frag.page) {
3080 put_page(sk->sk_frag.page);
3081 sk->sk_frag.page = NULL;
3082 sk->sk_frag.offset = 0;
3083 }
3084 sk_error_report(sk);
3085 return 0;
3086}
3087EXPORT_SYMBOL(tcp_disconnect);
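
/* Illustrative userspace sketch (not part of this source file):
 * tcp_disconnect() is what connect(2) with AF_UNSPEC resolves to (via
 * __inet_stream_connect()), returning the socket to an unconnected state so
 * it can be reused:
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));	// tears down the current association
 */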
3088
3089static inline bool tcp_can_repair_sock(const struct sock *sk)
3090{
3091 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
3092 (sk->sk_state != TCP_LISTEN);
3093}
3094
3095static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
3096{
3097 struct tcp_repair_window opt;
3098
3099 if (!tp->repair)
3100 return -EPERM;
3101
3102 if (len != sizeof(opt))
3103 return -EINVAL;
3104
3105 if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
3106 return -EFAULT;
3107
3108 if (opt.max_window < opt.snd_wnd)
3109 return -EINVAL;
3110
3111 if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
3112 return -EINVAL;
3113
3114 if (after(opt.rcv_wup, tp->rcv_nxt))
3115 return -EINVAL;
3116
3117 tp->snd_wl1 = opt.snd_wl1;
3118 tp->snd_wnd = opt.snd_wnd;
3119 tp->max_window = opt.max_window;
3120
3121 tp->rcv_wnd = opt.rcv_wnd;
3122 tp->rcv_wup = opt.rcv_wup;
3123
3124 return 0;
3125}
3126
3127static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
3128 unsigned int len)
3129{
3130 struct tcp_sock *tp = tcp_sk(sk);
3131 struct tcp_repair_opt opt;
3132 size_t offset = 0;
3133
3134 while (len >= sizeof(opt)) {
3135 if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
3136 return -EFAULT;
3137
3138 offset += sizeof(opt);
3139 len -= sizeof(opt);
3140
3141 switch (opt.opt_code) {
3142 case TCPOPT_MSS:
3143 tp->rx_opt.mss_clamp = opt.opt_val;
3144 tcp_mtup_init(sk);
3145 break;
3146 case TCPOPT_WINDOW:
3147 {
3148 u16 snd_wscale = opt.opt_val & 0xFFFF;
3149 u16 rcv_wscale = opt.opt_val >> 16;
3150
3151 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
3152 return -EFBIG;
3153
3154 tp->rx_opt.snd_wscale = snd_wscale;
3155 tp->rx_opt.rcv_wscale = rcv_wscale;
3156 tp->rx_opt.wscale_ok = 1;
3157 }
3158 break;
3159 case TCPOPT_SACK_PERM:
3160 if (opt.opt_val != 0)
3161 return -EINVAL;
3162
3163 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
3164 break;
3165 case TCPOPT_TIMESTAMP:
3166 if (opt.opt_val != 0)
3167 return -EINVAL;
3168
3169 tp->rx_opt.tstamp_ok = 1;
3170 break;
3171 }
3172 }
3173
3174 return 0;
3175}
3176
3177DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
3178EXPORT_SYMBOL(tcp_tx_delay_enabled);
3179
3180static void tcp_enable_tx_delay(void)
3181{
3182 if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
3183 static int __tcp_tx_delay_enabled = 0;
3184
3185 if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
3186 static_branch_enable(&tcp_tx_delay_enabled);
3187 pr_info("TCP_TX_DELAY enabled\n");
3188 }
3189 }
3190}
3191
/* TCP_CORK: when set, indicates to always queue non-full frames. Later the
 * user clears this option and we transmit any pending partial frames in the
 * queue. This is meant to be used alongside sendfile() to get properly
 * filled frames when the user (for example) must write out headers with a
 * write() call first and then use sendfile to send out the data parts.
 *
 * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
 * TCP_NODELAY.
 */
3201void __tcp_sock_set_cork(struct sock *sk, bool on)
3202{
3203 struct tcp_sock *tp = tcp_sk(sk);
3204
3205 if (on) {
3206 tp->nonagle |= TCP_NAGLE_CORK;
3207 } else {
3208 tp->nonagle &= ~TCP_NAGLE_CORK;
3209 if (tp->nonagle & TCP_NAGLE_OFF)
3210 tp->nonagle |= TCP_NAGLE_PUSH;
3211 tcp_push_pending_frames(sk);
3212 }
3213}
3214
3215void tcp_sock_set_cork(struct sock *sk, bool on)
3216{
3217 lock_sock(sk);
3218 __tcp_sock_set_cork(sk, on);
3219 release_sock(sk);
3220}
3221EXPORT_SYMBOL(tcp_sock_set_cork);
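
/* Illustrative userspace sketch (not part of this source file): the classic
 * cork pattern around sendfile(2); clearing the cork flushes any pending
 * partial frame:
 *
 *	int on = 1, off = 0;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *	write(fd, hdr, hdr_len);		// headers queued, not pushed
 *	sendfile(fd, file_fd, NULL, file_len);	// body sent in full frames
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *
 * hdr, hdr_len, file_fd and file_len are placeholders for application state.
 */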
3222
/* TCP_NODELAY is weaker than TCP_CORK, so that this option on a corked
 * socket is remembered, but it is not activated until the cork is cleared.
 *
 * However, when TCP_NODELAY is set we make an explicit push, which overrides
 * even TCP_CORK for currently queued segments.
 */
3229void __tcp_sock_set_nodelay(struct sock *sk, bool on)
3230{
3231 if (on) {
3232 tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
3233 tcp_push_pending_frames(sk);
3234 } else {
3235 tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
3236 }
3237}
3238
3239void tcp_sock_set_nodelay(struct sock *sk)
3240{
3241 lock_sock(sk);
3242 __tcp_sock_set_nodelay(sk, true);
3243 release_sock(sk);
3244}
3245EXPORT_SYMBOL(tcp_sock_set_nodelay);
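
/* Illustrative userspace sketch (not part of this source file): the
 * userspace equivalent of tcp_sock_set_nodelay() is the TCP_NODELAY option,
 * which disables Nagle and pushes any queued segments immediately:
 *
 *	int one = 1;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */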
3246
3247static void __tcp_sock_set_quickack(struct sock *sk, int val)
3248{
3249 if (!val) {
3250 inet_csk_enter_pingpong_mode(sk);
3251 return;
3252 }
3253
3254 inet_csk_exit_pingpong_mode(sk);
3255 if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3256 inet_csk_ack_scheduled(sk)) {
3257 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
3258 tcp_cleanup_rbuf(sk, 1);
3259 if (!(val & 1))
3260 inet_csk_enter_pingpong_mode(sk);
3261 }
3262}
3263
3264void tcp_sock_set_quickack(struct sock *sk, int val)
3265{
3266 lock_sock(sk);
3267 __tcp_sock_set_quickack(sk, val);
3268 release_sock(sk);
3269}
3270EXPORT_SYMBOL(tcp_sock_set_quickack);
3271
3272int tcp_sock_set_syncnt(struct sock *sk, int val)
3273{
3274 if (val < 1 || val > MAX_TCP_SYNCNT)
3275 return -EINVAL;
3276
3277 lock_sock(sk);
3278 inet_csk(sk)->icsk_syn_retries = val;
3279 release_sock(sk);
3280 return 0;
3281}
3282EXPORT_SYMBOL(tcp_sock_set_syncnt);
3283
3284void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
3285{
3286 lock_sock(sk);
3287 inet_csk(sk)->icsk_user_timeout = val;
3288 release_sock(sk);
3289}
3290EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3291
3292int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
3293{
3294 struct tcp_sock *tp = tcp_sk(sk);
3295
3296 if (val < 1 || val > MAX_TCP_KEEPIDLE)
3297 return -EINVAL;
3298
3299 tp->keepalive_time = val * HZ;
3300 if (sock_flag(sk, SOCK_KEEPOPEN) &&
3301 !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
3302 u32 elapsed = keepalive_time_elapsed(tp);
3303
3304 if (tp->keepalive_time > elapsed)
3305 elapsed = tp->keepalive_time - elapsed;
3306 else
3307 elapsed = 0;
3308 inet_csk_reset_keepalive_timer(sk, elapsed);
3309 }
3310
3311 return 0;
3312}
3313
3314int tcp_sock_set_keepidle(struct sock *sk, int val)
3315{
3316 int err;
3317
3318 lock_sock(sk);
3319 err = tcp_sock_set_keepidle_locked(sk, val);
3320 release_sock(sk);
3321 return err;
3322}
3323EXPORT_SYMBOL(tcp_sock_set_keepidle);
3324
3325int tcp_sock_set_keepintvl(struct sock *sk, int val)
3326{
3327 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3328 return -EINVAL;
3329
3330 lock_sock(sk);
3331 tcp_sk(sk)->keepalive_intvl = val * HZ;
3332 release_sock(sk);
3333 return 0;
3334}
3335EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3336
3337int tcp_sock_set_keepcnt(struct sock *sk, int val)
3338{
3339 if (val < 1 || val > MAX_TCP_KEEPCNT)
3340 return -EINVAL;
3341
3342 lock_sock(sk);
3343 tcp_sk(sk)->keepalive_probes = val;
3344 release_sock(sk);
3345 return 0;
3346}
3347EXPORT_SYMBOL(tcp_sock_set_keepcnt);
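
/* Illustrative userspace sketch (not part of this source file): the three
 * keepalive setters above back TCP_KEEPIDLE, TCP_KEEPINTVL and TCP_KEEPCNT.
 * With the values below an idle connection is probed after 60s, every 10s,
 * and dropped after 5 unanswered probes (SO_KEEPALIVE must also be enabled):
 *
 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 */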
3348
3349int tcp_set_window_clamp(struct sock *sk, int val)
3350{
3351 struct tcp_sock *tp = tcp_sk(sk);
3352
3353 if (!val) {
3354 if (sk->sk_state != TCP_CLOSE)
3355 return -EINVAL;
3356 tp->window_clamp = 0;
3357 } else {
3358 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3359 SOCK_MIN_RCVBUF / 2 : val;
3360 tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
3361 }
3362 return 0;
3363}
3364
/*
 *	Socket option code for TCP.
 */
3368static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
3369 sockptr_t optval, unsigned int optlen)
3370{
3371 struct tcp_sock *tp = tcp_sk(sk);
3372 struct inet_connection_sock *icsk = inet_csk(sk);
3373 struct net *net = sock_net(sk);
3374 int val;
3375 int err = 0;
3376
	/* These are data/string values, all the others are ints */
3378 switch (optname) {
3379 case TCP_CONGESTION: {
3380 char name[TCP_CA_NAME_MAX];
3381
3382 if (optlen < 1)
3383 return -EINVAL;
3384
3385 val = strncpy_from_sockptr(name, optval,
3386 min_t(long, TCP_CA_NAME_MAX-1, optlen));
3387 if (val < 0)
3388 return -EFAULT;
3389 name[val] = 0;
3390
3391 lock_sock(sk);
3392 err = tcp_set_congestion_control(sk, name, true,
3393 ns_capable(sock_net(sk)->user_ns,
3394 CAP_NET_ADMIN));
3395 release_sock(sk);
3396 return err;
3397 }
3398 case TCP_ULP: {
3399 char name[TCP_ULP_NAME_MAX];
3400
3401 if (optlen < 1)
3402 return -EINVAL;
3403
3404 val = strncpy_from_sockptr(name, optval,
3405 min_t(long, TCP_ULP_NAME_MAX - 1,
3406 optlen));
3407 if (val < 0)
3408 return -EFAULT;
3409 name[val] = 0;
3410
3411 lock_sock(sk);
3412 err = tcp_set_ulp(sk, name);
3413 release_sock(sk);
3414 return err;
3415 }
3416 case TCP_FASTOPEN_KEY: {
3417 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3418 __u8 *backup_key = NULL;
3419
		/* Allow a backup key as well to facilitate key rotation.
		 * First key is the active one.
		 */
3423 if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3424 optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3425 return -EINVAL;
3426
3427 if (copy_from_sockptr(key, optval, optlen))
3428 return -EFAULT;
3429
3430 if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3431 backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3432
3433 return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
3434 }
3435 default:
3436
3437 break;
3438 }
3439
3440 if (optlen < sizeof(int))
3441 return -EINVAL;
3442
3443 if (copy_from_sockptr(&val, optval, sizeof(val)))
3444 return -EFAULT;
3445
3446 lock_sock(sk);
3447
3448 switch (optname) {
3449 case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used.
		 */
3454 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
3455 err = -EINVAL;
3456 break;
3457 }
3458 tp->rx_opt.user_mss = val;
3459 break;
3460
3461 case TCP_NODELAY:
3462 __tcp_sock_set_nodelay(sk, val);
3463 break;
3464
3465 case TCP_THIN_LINEAR_TIMEOUTS:
3466 if (val < 0 || val > 1)
3467 err = -EINVAL;
3468 else
3469 tp->thin_lto = val;
3470 break;
3471
3472 case TCP_THIN_DUPACK:
3473 if (val < 0 || val > 1)
3474 err = -EINVAL;
3475 break;
3476
3477 case TCP_REPAIR:
3478 if (!tcp_can_repair_sock(sk))
3479 err = -EPERM;
3480 else if (val == TCP_REPAIR_ON) {
3481 tp->repair = 1;
3482 sk->sk_reuse = SK_FORCE_REUSE;
3483 tp->repair_queue = TCP_NO_QUEUE;
3484 } else if (val == TCP_REPAIR_OFF) {
3485 tp->repair = 0;
3486 sk->sk_reuse = SK_NO_REUSE;
3487 tcp_send_window_probe(sk);
3488 } else if (val == TCP_REPAIR_OFF_NO_WP) {
3489 tp->repair = 0;
3490 sk->sk_reuse = SK_NO_REUSE;
3491 } else
3492 err = -EINVAL;
3493
3494 break;
3495
3496 case TCP_REPAIR_QUEUE:
3497 if (!tp->repair)
3498 err = -EPERM;
3499 else if ((unsigned int)val < TCP_QUEUES_NR)
3500 tp->repair_queue = val;
3501 else
3502 err = -EINVAL;
3503 break;
3504
3505 case TCP_QUEUE_SEQ:
3506 if (sk->sk_state != TCP_CLOSE) {
3507 err = -EPERM;
3508 } else if (tp->repair_queue == TCP_SEND_QUEUE) {
3509 if (!tcp_rtx_queue_empty(sk))
3510 err = -EPERM;
3511 else
3512 WRITE_ONCE(tp->write_seq, val);
3513 } else if (tp->repair_queue == TCP_RECV_QUEUE) {
3514 if (tp->rcv_nxt != tp->copied_seq) {
3515 err = -EPERM;
3516 } else {
3517 WRITE_ONCE(tp->rcv_nxt, val);
3518 WRITE_ONCE(tp->copied_seq, val);
3519 }
3520 } else {
3521 err = -EINVAL;
3522 }
3523 break;
3524
3525 case TCP_REPAIR_OPTIONS:
3526 if (!tp->repair)
3527 err = -EINVAL;
3528 else if (sk->sk_state == TCP_ESTABLISHED)
3529 err = tcp_repair_options_est(sk, optval, optlen);
3530 else
3531 err = -EPERM;
3532 break;
3533
3534 case TCP_CORK:
3535 __tcp_sock_set_cork(sk, val);
3536 break;
3537
3538 case TCP_KEEPIDLE:
3539 err = tcp_sock_set_keepidle_locked(sk, val);
3540 break;
3541 case TCP_KEEPINTVL:
3542 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3543 err = -EINVAL;
3544 else
3545 tp->keepalive_intvl = val * HZ;
3546 break;
3547 case TCP_KEEPCNT:
3548 if (val < 1 || val > MAX_TCP_KEEPCNT)
3549 err = -EINVAL;
3550 else
3551 tp->keepalive_probes = val;
3552 break;
3553 case TCP_SYNCNT:
3554 if (val < 1 || val > MAX_TCP_SYNCNT)
3555 err = -EINVAL;
3556 else
3557 icsk->icsk_syn_retries = val;
3558 break;
3559
3560 case TCP_SAVE_SYN:
		/* 0: disable, 1: enable, 2: start from ether_header */
3562 if (val < 0 || val > 2)
3563 err = -EINVAL;
3564 else
3565 tp->save_syn = val;
3566 break;
3567
3568 case TCP_LINGER2:
3569 if (val < 0)
3570 tp->linger2 = -1;
3571 else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3572 tp->linger2 = TCP_FIN_TIMEOUT_MAX;
3573 else
3574 tp->linger2 = val * HZ;
3575 break;
3576
3577 case TCP_DEFER_ACCEPT:
		/* Translate value in seconds to number of retransmits */
3579 icsk->icsk_accept_queue.rskq_defer_accept =
3580 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3581 TCP_RTO_MAX / HZ);
3582 break;
3583
3584 case TCP_WINDOW_CLAMP:
3585 err = tcp_set_window_clamp(sk, val);
3586 break;
3587
3588 case TCP_QUICKACK:
3589 __tcp_sock_set_quickack(sk, val);
3590 break;
3591
3592#ifdef CONFIG_TCP_MD5SIG
3593 case TCP_MD5SIG:
3594 case TCP_MD5SIG_EXT:
3595 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3596 break;
3597#endif
3598 case TCP_USER_TIMEOUT:
		/* Cap the max time in ms TCP will retry or probe the window
		 * before giving up and aborting (sending a RST) the
		 * connection.
		 */
3602 if (val < 0)
3603 err = -EINVAL;
3604 else
3605 icsk->icsk_user_timeout = val;
3606 break;
3607
3608 case TCP_FASTOPEN:
3609 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3610 TCPF_LISTEN))) {
3611 tcp_fastopen_init_key_once(net);
3612
3613 fastopen_queue_tune(sk, val);
3614 } else {
3615 err = -EINVAL;
3616 }
3617 break;
3618 case TCP_FASTOPEN_CONNECT:
3619 if (val > 1 || val < 0) {
3620 err = -EINVAL;
3621 } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
3622 TFO_CLIENT_ENABLE) {
3623 if (sk->sk_state == TCP_CLOSE)
3624 tp->fastopen_connect = val;
3625 else
3626 err = -EINVAL;
3627 } else {
3628 err = -EOPNOTSUPP;
3629 }
3630 break;
3631 case TCP_FASTOPEN_NO_COOKIE:
3632 if (val > 1 || val < 0)
3633 err = -EINVAL;
3634 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3635 err = -EINVAL;
3636 else
3637 tp->fastopen_no_cookie = val;
3638 break;
3639 case TCP_TIMESTAMP:
3640 if (!tp->repair)
3641 err = -EPERM;
3642 else
3643 tp->tsoffset = val - tcp_time_stamp_raw();
3644 break;
3645 case TCP_REPAIR_WINDOW:
3646 err = tcp_repair_set_window(tp, optval, optlen);
3647 break;
3648 case TCP_NOTSENT_LOWAT:
3649 tp->notsent_lowat = val;
3650 sk->sk_write_space(sk);
3651 break;
3652 case TCP_INQ:
3653 if (val > 1 || val < 0)
3654 err = -EINVAL;
3655 else
3656 tp->recvmsg_inq = val;
3657 break;
3658 case TCP_TX_DELAY:
3659 if (val)
3660 tcp_enable_tx_delay();
3661 tp->tcp_tx_delay = val;
3662 break;
3663 default:
3664 err = -ENOPROTOOPT;
3665 break;
3666 }
3667
3668 release_sock(sk);
3669 return err;
3670}
3671
3672int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
3673 unsigned int optlen)
3674{
3675 const struct inet_connection_sock *icsk = inet_csk(sk);
3676
3677 if (level != SOL_TCP)
3678 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3679 optval, optlen);
3680 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3681}
3682EXPORT_SYMBOL(tcp_setsockopt);
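
/* Illustrative userspace sketch (not part of this source file):
 * TCP_CONGESTION is one of the string-valued options handled before the
 * integer path in do_tcp_setsockopt(); non-default algorithms may require
 * CAP_NET_ADMIN unless listed in net.ipv4.tcp_allowed_congestion_control:
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cubic", strlen("cubic"));
 */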
3683
3684static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3685 struct tcp_info *info)
3686{
3687 u64 stats[__TCP_CHRONO_MAX], total = 0;
3688 enum tcp_chrono i;
3689
3690 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3691 stats[i] = tp->chrono_stat[i - 1];
3692 if (i == tp->chrono_type)
3693 stats[i] += tcp_jiffies32 - tp->chrono_start;
3694 stats[i] *= USEC_PER_SEC / HZ;
3695 total += stats[i];
3696 }
3697
3698 info->tcpi_busy_time = total;
3699 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3700 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3701}
3702
/* Return information about state of tcp endpoint in API format. */
3704void tcp_get_info(struct sock *sk, struct tcp_info *info)
3705{
3706 const struct tcp_sock *tp = tcp_sk(sk);
3707 const struct inet_connection_sock *icsk = inet_csk(sk);
3708 unsigned long rate;
3709 u32 now;
3710 u64 rate64;
3711 bool slow;
3712
3713 memset(info, 0, sizeof(*info));
3714 if (sk->sk_type != SOCK_STREAM)
3715 return;
3716
3717 info->tcpi_state = inet_sk_state_load(sk);
3718
	/* Report meaningful fields for all TCP states, including listeners */
3720 rate = READ_ONCE(sk->sk_pacing_rate);
3721 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3722 info->tcpi_pacing_rate = rate64;
3723
3724 rate = READ_ONCE(sk->sk_max_pacing_rate);
3725 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3726 info->tcpi_max_pacing_rate = rate64;
3727
3728 info->tcpi_reordering = tp->reordering;
3729 info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
3730
3731 if (info->tcpi_state == TCP_LISTEN) {
		/* listeners aliased fields :
		 * tcpi_unacked -> Number of children ready for accept()
		 * tcpi_sacked  -> max backlog
		 */
3736 info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3737 info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
3738 return;
3739 }
3740
3741 slow = lock_sock_fast(sk);
3742
3743 info->tcpi_ca_state = icsk->icsk_ca_state;
3744 info->tcpi_retransmits = icsk->icsk_retransmits;
3745 info->tcpi_probes = icsk->icsk_probes_out;
3746 info->tcpi_backoff = icsk->icsk_backoff;
3747
3748 if (tp->rx_opt.tstamp_ok)
3749 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3750 if (tcp_is_sack(tp))
3751 info->tcpi_options |= TCPI_OPT_SACK;
3752 if (tp->rx_opt.wscale_ok) {
3753 info->tcpi_options |= TCPI_OPT_WSCALE;
3754 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3755 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3756 }
3757
3758 if (tp->ecn_flags & TCP_ECN_OK)
3759 info->tcpi_options |= TCPI_OPT_ECN;
3760 if (tp->ecn_flags & TCP_ECN_SEEN)
3761 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3762 if (tp->syn_data_acked)
3763 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3764
3765 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3766 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3767 info->tcpi_snd_mss = tp->mss_cache;
3768 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3769
3770 info->tcpi_unacked = tp->packets_out;
3771 info->tcpi_sacked = tp->sacked_out;
3772
3773 info->tcpi_lost = tp->lost_out;
3774 info->tcpi_retrans = tp->retrans_out;
3775
3776 now = tcp_jiffies32;
3777 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3778 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3779 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3780
3781 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3782 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3783 info->tcpi_rtt = tp->srtt_us >> 3;
3784 info->tcpi_rttvar = tp->mdev_us >> 2;
3785 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3786 info->tcpi_advmss = tp->advmss;
3787
3788 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3789 info->tcpi_rcv_space = tp->rcvq_space.space;
3790
3791 info->tcpi_total_retrans = tp->total_retrans;
3792
3793 info->tcpi_bytes_acked = tp->bytes_acked;
3794 info->tcpi_bytes_received = tp->bytes_received;
3795 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3796 tcp_get_info_chrono_stats(tp, info);
3797
3798 info->tcpi_segs_out = tp->segs_out;

	/* segs_in and data_segs_in can be updated from tcp_segs_in()
	 * from skb receive path and not protected by socket lock.
	 */
3801 info->tcpi_segs_in = READ_ONCE(tp->segs_in);
3802 info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
3803
3804 info->tcpi_min_rtt = tcp_min_rtt(tp);
3805 info->tcpi_data_segs_out = tp->data_segs_out;
3806
3807 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3808 rate64 = tcp_compute_delivery_rate(tp);
3809 if (rate64)
3810 info->tcpi_delivery_rate = rate64;
3811 info->tcpi_delivered = tp->delivered;
3812 info->tcpi_delivered_ce = tp->delivered_ce;
3813 info->tcpi_bytes_sent = tp->bytes_sent;
3814 info->tcpi_bytes_retrans = tp->bytes_retrans;
3815 info->tcpi_dsack_dups = tp->dsack_dups;
3816 info->tcpi_reord_seen = tp->reord_seen;
3817 info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3818 info->tcpi_snd_wnd = tp->snd_wnd;
3819 info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
3820 unlock_sock_fast(sk, slow);
3821}
3822EXPORT_SYMBOL_GPL(tcp_get_info);
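
/* Illustrative userspace sketch (not part of this source file):
 * tcp_get_info() backs the TCP_INFO getsockopt. Passing a shorter (older)
 * struct tcp_info is fine; the kernel copies at most len bytes and reports
 * the copied length back:
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len);
 *	printf("rtt %uus retrans %u\n", ti.tcpi_rtt, ti.tcpi_total_retrans);
 */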
3823
3824static size_t tcp_opt_stats_get_size(void)
3825{
3826 return
3827 nla_total_size_64bit(sizeof(u64)) +
3828 nla_total_size_64bit(sizeof(u64)) +
3829 nla_total_size_64bit(sizeof(u64)) +
3830 nla_total_size_64bit(sizeof(u64)) +
3831 nla_total_size_64bit(sizeof(u64)) +
3832 nla_total_size_64bit(sizeof(u64)) +
3833 nla_total_size_64bit(sizeof(u64)) +
3834 nla_total_size(sizeof(u32)) +
3835 nla_total_size(sizeof(u32)) +
3836 nla_total_size(sizeof(u32)) +
3837 nla_total_size(sizeof(u8)) +
3838 nla_total_size(sizeof(u8)) +
3839 nla_total_size(sizeof(u32)) +
3840 nla_total_size(sizeof(u8)) +
3841 nla_total_size(sizeof(u32)) +
3842 nla_total_size(sizeof(u32)) +
3843 nla_total_size(sizeof(u32)) +
3844 nla_total_size_64bit(sizeof(u64)) +
3845 nla_total_size_64bit(sizeof(u64)) +
3846 nla_total_size(sizeof(u32)) +
3847 nla_total_size(sizeof(u32)) +
3848 nla_total_size(sizeof(u32)) +
3849 nla_total_size(sizeof(u16)) +
3850 nla_total_size(sizeof(u32)) +
3851 nla_total_size_64bit(sizeof(u64)) +
3852 nla_total_size(sizeof(u8)) +
3853 0;
3854}
3855
/* Returns TTL or hop limit of an incoming packet from skb. */
3857static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
3858{
3859 if (skb->protocol == htons(ETH_P_IP))
3860 return ip_hdr(skb)->ttl;
3861 else if (skb->protocol == htons(ETH_P_IPV6))
3862 return ipv6_hdr(skb)->hop_limit;
3863 else
3864 return 0;
3865}
3866
3867struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
3868 const struct sk_buff *orig_skb,
3869 const struct sk_buff *ack_skb)
3870{
3871 const struct tcp_sock *tp = tcp_sk(sk);
3872 struct sk_buff *stats;
3873 struct tcp_info info;
3874 unsigned long rate;
3875 u64 rate64;
3876
3877 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3878 if (!stats)
3879 return NULL;
3880
3881 tcp_get_info_chrono_stats(tp, &info);
3882 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3883 info.tcpi_busy_time, TCP_NLA_PAD);
3884 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3885 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3886 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3887 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3888 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3889 tp->data_segs_out, TCP_NLA_PAD);
3890 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3891 tp->total_retrans, TCP_NLA_PAD);
3892
3893 rate = READ_ONCE(sk->sk_pacing_rate);
3894 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3895 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3896
3897 rate64 = tcp_compute_delivery_rate(tp);
3898 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3899
3900 nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
3901 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3902 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3903
3904 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3905 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3906 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3907 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3908 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3909
3910 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3911 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3912
3913 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3914 TCP_NLA_PAD);
3915 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3916 TCP_NLA_PAD);
3917 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3918 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3919 nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3920 nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
3921 nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
3922 max_t(int, 0, tp->write_seq - tp->snd_nxt));
3923 nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
3924 TCP_NLA_PAD);
3925 if (ack_skb)
3926 nla_put_u8(stats, TCP_NLA_TTL,
3927 tcp_skb_ttl_or_hop_limit(ack_skb));
3928
3929 return stats;
3930}
3931
3932static int do_tcp_getsockopt(struct sock *sk, int level,
3933 int optname, char __user *optval, int __user *optlen)
3934{
3935 struct inet_connection_sock *icsk = inet_csk(sk);
3936 struct tcp_sock *tp = tcp_sk(sk);
3937 struct net *net = sock_net(sk);
3938 int val, len;
3939
3940 if (get_user(len, optlen))
3941 return -EFAULT;
3942
3943 len = min_t(unsigned int, len, sizeof(int));
3944
3945 if (len < 0)
3946 return -EINVAL;
3947
3948 switch (optname) {
3949 case TCP_MAXSEG:
3950 val = tp->mss_cache;
3951 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3952 val = tp->rx_opt.user_mss;
3953 if (tp->repair)
3954 val = tp->rx_opt.mss_clamp;
3955 break;
3956 case TCP_NODELAY:
3957 val = !!(tp->nonagle&TCP_NAGLE_OFF);
3958 break;
3959 case TCP_CORK:
3960 val = !!(tp->nonagle&TCP_NAGLE_CORK);
3961 break;
3962 case TCP_KEEPIDLE:
3963 val = keepalive_time_when(tp) / HZ;
3964 break;
3965 case TCP_KEEPINTVL:
3966 val = keepalive_intvl_when(tp) / HZ;
3967 break;
3968 case TCP_KEEPCNT:
3969 val = keepalive_probes(tp);
3970 break;
3971 case TCP_SYNCNT:
3972 val = icsk->icsk_syn_retries ? :
3973 READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
3974 break;
3975 case TCP_LINGER2:
3976 val = tp->linger2;
3977 if (val >= 0)
3978 val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
3979 break;
3980 case TCP_DEFER_ACCEPT:
3981 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3982 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3983 break;
3984 case TCP_WINDOW_CLAMP:
3985 val = tp->window_clamp;
3986 break;
3987 case TCP_INFO: {
3988 struct tcp_info info;
3989
3990 if (get_user(len, optlen))
3991 return -EFAULT;
3992
3993 tcp_get_info(sk, &info);
3994
3995 len = min_t(unsigned int, len, sizeof(info));
3996 if (put_user(len, optlen))
3997 return -EFAULT;
3998 if (copy_to_user(optval, &info, len))
3999 return -EFAULT;
4000 return 0;
4001 }
4002 case TCP_CC_INFO: {
4003 const struct tcp_congestion_ops *ca_ops;
4004 union tcp_cc_info info;
4005 size_t sz = 0;
4006 int attr;
4007
4008 if (get_user(len, optlen))
4009 return -EFAULT;
4010
4011 ca_ops = icsk->icsk_ca_ops;
4012 if (ca_ops && ca_ops->get_info)
4013 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
4014
4015 len = min_t(unsigned int, len, sz);
4016 if (put_user(len, optlen))
4017 return -EFAULT;
4018 if (copy_to_user(optval, &info, len))
4019 return -EFAULT;
4020 return 0;
4021 }
4022 case TCP_QUICKACK:
4023 val = !inet_csk_in_pingpong_mode(sk);
4024 break;
4025
4026 case TCP_CONGESTION:
4027 if (get_user(len, optlen))
4028 return -EFAULT;
4029 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
4030 if (put_user(len, optlen))
4031 return -EFAULT;
4032 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
4033 return -EFAULT;
4034 return 0;
4035
4036 case TCP_ULP:
4037 if (get_user(len, optlen))
4038 return -EFAULT;
4039 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
4040 if (!icsk->icsk_ulp_ops) {
4041 if (put_user(0, optlen))
4042 return -EFAULT;
4043 return 0;
4044 }
4045 if (put_user(len, optlen))
4046 return -EFAULT;
4047 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
4048 return -EFAULT;
4049 return 0;
4050
4051 case TCP_FASTOPEN_KEY: {
4052 u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
4053 unsigned int key_len;
4054
4055 if (get_user(len, optlen))
4056 return -EFAULT;
4057
4058 key_len = tcp_fastopen_get_cipher(net, icsk, key) *
4059 TCP_FASTOPEN_KEY_LENGTH;
4060 len = min_t(unsigned int, len, key_len);
4061 if (put_user(len, optlen))
4062 return -EFAULT;
4063 if (copy_to_user(optval, key, len))
4064 return -EFAULT;
4065 return 0;
4066 }
4067 case TCP_THIN_LINEAR_TIMEOUTS:
4068 val = tp->thin_lto;
4069 break;
4070
4071 case TCP_THIN_DUPACK:
4072 val = 0;
4073 break;
4074
4075 case TCP_REPAIR:
4076 val = tp->repair;
4077 break;
4078
4079 case TCP_REPAIR_QUEUE:
4080 if (tp->repair)
4081 val = tp->repair_queue;
4082 else
4083 return -EINVAL;
4084 break;
4085
4086 case TCP_REPAIR_WINDOW: {
4087 struct tcp_repair_window opt;
4088
4089 if (get_user(len, optlen))
4090 return -EFAULT;
4091
4092 if (len != sizeof(opt))
4093 return -EINVAL;
4094
4095 if (!tp->repair)
4096 return -EPERM;
4097
4098 opt.snd_wl1 = tp->snd_wl1;
4099 opt.snd_wnd = tp->snd_wnd;
4100 opt.max_window = tp->max_window;
4101 opt.rcv_wnd = tp->rcv_wnd;
4102 opt.rcv_wup = tp->rcv_wup;
4103
4104 if (copy_to_user(optval, &opt, len))
4105 return -EFAULT;
4106 return 0;
4107 }
4108 case TCP_QUEUE_SEQ:
4109 if (tp->repair_queue == TCP_SEND_QUEUE)
4110 val = tp->write_seq;
4111 else if (tp->repair_queue == TCP_RECV_QUEUE)
4112 val = tp->rcv_nxt;
4113 else
4114 return -EINVAL;
4115 break;
4116
4117 case TCP_USER_TIMEOUT:
4118 val = icsk->icsk_user_timeout;
4119 break;
4120
4121 case TCP_FASTOPEN:
4122 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
4123 break;
4124
4125 case TCP_FASTOPEN_CONNECT:
4126 val = tp->fastopen_connect;
4127 break;
4128
4129 case TCP_FASTOPEN_NO_COOKIE:
4130 val = tp->fastopen_no_cookie;
4131 break;
4132
4133 case TCP_TX_DELAY:
4134 val = tp->tcp_tx_delay;
4135 break;
4136
4137 case TCP_TIMESTAMP:
4138 val = tcp_time_stamp_raw() + tp->tsoffset;
4139 break;
4140 case TCP_NOTSENT_LOWAT:
4141 val = tp->notsent_lowat;
4142 break;
4143 case TCP_INQ:
4144 val = tp->recvmsg_inq;
4145 break;
4146 case TCP_SAVE_SYN:
4147 val = tp->save_syn;
4148 break;
4149 case TCP_SAVED_SYN: {
4150 if (get_user(len, optlen))
4151 return -EFAULT;
4152
4153 lock_sock(sk);
4154 if (tp->saved_syn) {
4155 if (len < tcp_saved_syn_len(tp->saved_syn)) {
4156 if (put_user(tcp_saved_syn_len(tp->saved_syn),
4157 optlen)) {
4158 release_sock(sk);
4159 return -EFAULT;
4160 }
4161 release_sock(sk);
4162 return -EINVAL;
4163 }
4164 len = tcp_saved_syn_len(tp->saved_syn);
4165 if (put_user(len, optlen)) {
4166 release_sock(sk);
4167 return -EFAULT;
4168 }
4169 if (copy_to_user(optval, tp->saved_syn->data, len)) {
4170 release_sock(sk);
4171 return -EFAULT;
4172 }
4173 tcp_saved_syn_free(tp);
4174 release_sock(sk);
4175 } else {
4176 release_sock(sk);
4177 len = 0;
4178 if (put_user(len, optlen))
4179 return -EFAULT;
4180 }
4181 return 0;
4182 }
4183#ifdef CONFIG_MMU
4184 case TCP_ZEROCOPY_RECEIVE: {
4185 struct scm_timestamping_internal tss;
4186 struct tcp_zerocopy_receive zc = {};
4187 int err;
4188
4189 if (get_user(len, optlen))
4190 return -EFAULT;
4191 if (len < 0 ||
4192 len < offsetofend(struct tcp_zerocopy_receive, length))
4193 return -EINVAL;
4194 if (unlikely(len > sizeof(zc))) {
4195 err = check_zeroed_user(optval + sizeof(zc),
4196 len - sizeof(zc));
4197 if (err < 1)
4198 return err == 0 ? -EINVAL : err;
4199 len = sizeof(zc);
4200 if (put_user(len, optlen))
4201 return -EFAULT;
4202 }
4203 if (copy_from_user(&zc, optval, len))
4204 return -EFAULT;
4205 if (zc.reserved)
4206 return -EINVAL;
4207 if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
4208 return -EINVAL;
4209 lock_sock(sk);
4210 err = tcp_zerocopy_receive(sk, &zc, &tss);
4211 err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
4212 &zc, &len, err);
4213 release_sock(sk);
4214 if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
4215 goto zerocopy_rcv_cmsg;
4216 switch (len) {
4217 case offsetofend(struct tcp_zerocopy_receive, msg_flags):
4218 goto zerocopy_rcv_cmsg;
4219 case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
4220 case offsetofend(struct tcp_zerocopy_receive, msg_control):
4221 case offsetofend(struct tcp_zerocopy_receive, flags):
4222 case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
4223 case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
4224 case offsetofend(struct tcp_zerocopy_receive, err):
4225 goto zerocopy_rcv_sk_err;
4226 case offsetofend(struct tcp_zerocopy_receive, inq):
4227 goto zerocopy_rcv_inq;
4228 case offsetofend(struct tcp_zerocopy_receive, length):
4229 default:
4230 goto zerocopy_rcv_out;
4231 }
4232zerocopy_rcv_cmsg:
4233 if (zc.msg_flags & TCP_CMSG_TS)
4234 tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
4235 else
4236 zc.msg_flags = 0;
4237zerocopy_rcv_sk_err:
4238 if (!err)
4239 zc.err = sock_error(sk);
4240zerocopy_rcv_inq:
4241 zc.inq = tcp_inq_hint(sk);
4242zerocopy_rcv_out:
4243 if (!err && copy_to_user(optval, &zc, len))
4244 err = -EFAULT;
4245 return err;
4246 }
4247#endif
4248 default:
4249 return -ENOPROTOOPT;
4250 }
4251
4252 if (put_user(len, optlen))
4253 return -EFAULT;
4254 if (copy_to_user(optval, &val, len))
4255 return -EFAULT;
4256 return 0;
4257}
4258
4259bool tcp_bpf_bypass_getsockopt(int level, int optname)
4260{
	/* do_tcp_getsockopt() has an optimized TCP_ZEROCOPY_RECEIVE path that
	 * runs the cgroup BPF getsockopt hook itself (see
	 * BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN above), so skip the generic
	 * hook here to avoid copying the option value twice.
	 */
4264 if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
4265 return true;
4266
4267 return false;
4268}
4269EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
4270
4271int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
4272 int __user *optlen)
4273{
4274 struct inet_connection_sock *icsk = inet_csk(sk);
4275
4276 if (level != SOL_TCP)
4277 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
4278 optval, optlen);
4279 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
4280}
4281EXPORT_SYMBOL(tcp_getsockopt);
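
/* Illustrative userspace sketch (not part of this source file): the
 * TCP_SAVE_SYN / TCP_SAVED_SYN pair handled above lets a server retrieve the
 * headers of the SYN that created an accepted socket (the saved SYN is freed
 * after the first successful read):
 *
 *	int one = 1, cfd;
 *	unsigned char syn[512];
 *	socklen_t len = sizeof(syn);
 *
 *	setsockopt(lfd, IPPROTO_TCP, TCP_SAVE_SYN, &one, sizeof(one));
 *	cfd = accept(lfd, NULL, NULL);
 *	getsockopt(cfd, IPPROTO_TCP, TCP_SAVED_SYN, syn, &len);
 */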
4282
4283#ifdef CONFIG_TCP_MD5SIG
4284static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
4285static DEFINE_MUTEX(tcp_md5sig_mutex);
4286static bool tcp_md5sig_pool_populated = false;
4287
4288static void __tcp_alloc_md5sig_pool(void)
4289{
4290 struct crypto_ahash *hash;
4291 int cpu;
4292
4293 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
4294 if (IS_ERR(hash))
4295 return;
4296
4297 for_each_possible_cpu(cpu) {
4298 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
4299 struct ahash_request *req;
4300
4301 if (!scratch) {
4302 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
4303 sizeof(struct tcphdr),
4304 GFP_KERNEL,
4305 cpu_to_node(cpu));
4306 if (!scratch)
4307 return;
4308 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
4309 }
4310 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
4311 continue;
4312
4313 req = ahash_request_alloc(hash, GFP_KERNEL);
4314 if (!req)
4315 return;
4316
4317 ahash_request_set_callback(req, 0, NULL, NULL);
4318
4319 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
4320 }
4321
	/* before setting tcp_md5sig_pool_populated, we must commit all writes
	 * to memory. See smp_rmb() in tcp_get_md5sig_pool()
	 */
4324 smp_wmb();
4325 tcp_md5sig_pool_populated = true;
4326}
4327
4328bool tcp_alloc_md5sig_pool(void)
4329{
4330 if (unlikely(!tcp_md5sig_pool_populated)) {
4331 mutex_lock(&tcp_md5sig_mutex);
4332
4333 if (!tcp_md5sig_pool_populated) {
4334 __tcp_alloc_md5sig_pool();
4335 if (tcp_md5sig_pool_populated)
4336 static_branch_inc(&tcp_md5_needed);
4337 }
4338
4339 mutex_unlock(&tcp_md5sig_mutex);
4340 }
4341 return tcp_md5sig_pool_populated;
4342}
4343EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
4344

/**
 *	tcp_get_md5sig_pool - get md5sig_pool for this user
 *
 *	We use percpu structure, so if we succeed, we exit with preemption
 *	and BH disabled, so that the caller can use the per-cpu pool without
 *	being migrated or preempted by softirq processing.
 */
4353struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
4354{
4355 local_bh_disable();
4356
4357 if (tcp_md5sig_pool_populated) {
		/* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
4359 smp_rmb();
4360 return this_cpu_ptr(&tcp_md5sig_pool);
4361 }
4362 local_bh_enable();
4363 return NULL;
4364}
4365EXPORT_SYMBOL(tcp_get_md5sig_pool);
4366
4367int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
4368 const struct sk_buff *skb, unsigned int header_len)
4369{
4370 struct scatterlist sg;
4371 const struct tcphdr *tp = tcp_hdr(skb);
4372 struct ahash_request *req = hp->md5_req;
4373 unsigned int i;
4374 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
4375 skb_headlen(skb) - header_len : 0;
4376 const struct skb_shared_info *shi = skb_shinfo(skb);
4377 struct sk_buff *frag_iter;
4378
4379 sg_init_table(&sg, 1);
4380
4381 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
4382 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
4383 if (crypto_ahash_update(req))
4384 return 1;
4385
4386 for (i = 0; i < shi->nr_frags; ++i) {
4387 const skb_frag_t *f = &shi->frags[i];
4388 unsigned int offset = skb_frag_off(f);
4389 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
4390
4391 sg_set_page(&sg, page, skb_frag_size(f),
4392 offset_in_page(offset));
4393 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
4394 if (crypto_ahash_update(req))
4395 return 1;
4396 }
4397
4398 skb_walk_frags(skb, frag_iter)
4399 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
4400 return 1;
4401
4402 return 0;
4403}
4404EXPORT_SYMBOL(tcp_md5_hash_skb_data);
4405
4406int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
4407{
4408 u8 keylen = READ_ONCE(key->keylen);
4409 struct scatterlist sg;
4410
4411 sg_init_one(&sg, key->key, keylen);
4412 ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen);
4413
	/* We use data_race() because tcp_md5_do_add() might change
	 * key->key under us
	 */
4415 return data_race(crypto_ahash_update(hp->md5_req));
4416}
4417EXPORT_SYMBOL(tcp_md5_hash_key);
4418
4419
4420enum skb_drop_reason
4421tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
4422 const void *saddr, const void *daddr,
4423 int family, int dif, int sdif)
4424{
	/* This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and its wrong.
	 */
4433 const __u8 *hash_location = NULL;
4434 struct tcp_md5sig_key *hash_expected;
4435 const struct tcphdr *th = tcp_hdr(skb);
4436 struct tcp_sock *tp = tcp_sk(sk);
4437 int genhash, l3index;
4438 u8 newhash[16];
4439
	/* sdif set, means packet ingressed via a device
	 * in an L3 domain and dif is set to the l3mdev
	 */
4443 l3index = sdif ? dif : 0;
4444
4445 hash_expected = tcp_md5_do_lookup(sk, l3index, saddr, family);
4446 hash_location = tcp_parse_md5sig_option(th);
4447
	/* We've parsed the options - do we have a hash? */
4449 if (!hash_expected && !hash_location)
4450 return SKB_NOT_DROPPED_YET;
4451
4452 if (hash_expected && !hash_location) {
4453 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
4454 return SKB_DROP_REASON_TCP_MD5NOTFOUND;
4455 }
4456
4457 if (!hash_expected && hash_location) {
4458 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
4459 return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
4460 }
4461
	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
4466 if (family == AF_INET)
4467 genhash = tcp_v4_md5_hash_skb(newhash,
4468 hash_expected,
4469 NULL, skb);
4470 else
4471 genhash = tp->af_specific->calc_md5_hash(newhash,
4472 hash_expected,
4473 NULL, skb);
4474
4475 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
4476 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
4477 if (family == AF_INET) {
4478 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
4479 saddr, ntohs(th->source),
4480 daddr, ntohs(th->dest),
4481 genhash ? " tcp_v4_calc_md5_hash failed"
4482 : "", l3index);
4483 } else {
4484 net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n",
4485 genhash ? "failed" : "mismatch",
4486 saddr, ntohs(th->source),
4487 daddr, ntohs(th->dest), l3index);
4488 }
4489 return SKB_DROP_REASON_TCP_MD5FAILURE;
4490 }
4491 return SKB_NOT_DROPPED_YET;
4492}
4493EXPORT_SYMBOL(tcp_inbound_md5_hash);
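
/* Illustrative userspace sketch (not part of this source file): the segments
 * verified above carry RFC 2385 MD5 signatures, configured per peer with the
 * TCP_MD5SIG socket option (both ends must agree on the key):
 *
 *	struct tcp_md5sig md5 = {};
 *
 *	memcpy(&md5.tcpm_addr, &peer_addr, sizeof(peer_addr));
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * peer_addr is a placeholder sockaddr_in/sockaddr_in6 naming the remote end.
 */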
4494
4495#endif
4496
4497void tcp_done(struct sock *sk)
4498{
4499 struct request_sock *req;
4500
	/* We might be called with a new socket, after
	 * inet_csk_prepare_forced_close() has been called
	 * so we can not use lockdep_sock_is_held(sk)
	 */
4505 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
4506
4507 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
4508 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
4509
4510 tcp_set_state(sk, TCP_CLOSE);
4511 tcp_clear_xmit_timers(sk);
4512 if (req)
4513 reqsk_fastopen_remove(sk, req, false);
4514
4515 sk->sk_shutdown = SHUTDOWN_MASK;
4516
4517 if (!sock_flag(sk, SOCK_DEAD))
4518 sk->sk_state_change(sk);
4519 else
4520 inet_csk_destroy_sock(sk);
4521}
4522EXPORT_SYMBOL_GPL(tcp_done);
4523
4524int tcp_abort(struct sock *sk, int err)
4525{
4526 if (!sk_fullsock(sk)) {
4527 if (sk->sk_state == TCP_NEW_SYN_RECV) {
4528 struct request_sock *req = inet_reqsk(sk);
4529
4530 local_bh_disable();
4531 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
4532 local_bh_enable();
4533 return 0;
4534 }
4535 return -EOPNOTSUPP;
4536 }
4537
	/* Don't race with userspace socket closes such as tcp_close. */
4539 lock_sock(sk);
4540
4541 if (sk->sk_state == TCP_LISTEN) {
4542 tcp_set_state(sk, TCP_CLOSE);
4543 inet_csk_listen_stop(sk);
4544 }
4545
	/* Don't race with BH socket closes such as inet_csk_listen_stop. */
4547 local_bh_disable();
4548 bh_lock_sock(sk);
4549
4550 if (!sock_flag(sk, SOCK_DEAD)) {
4551 sk->sk_err = err;
		/* This barrier is coupled with smp_rmb() in tcp_poll() */
4553 smp_wmb();
4554 sk_error_report(sk);
4555 if (tcp_need_reset(sk->sk_state))
4556 tcp_send_active_reset(sk, GFP_ATOMIC);
4557 tcp_done(sk);
4558 }
4559
4560 bh_unlock_sock(sk);
4561 local_bh_enable();
4562 tcp_write_queue_purge(sk);
4563 release_sock(sk);
4564 return 0;
4565}
4566EXPORT_SYMBOL_GPL(tcp_abort);
4567
4568extern struct tcp_congestion_ops tcp_reno;
4569
4570static __initdata unsigned long thash_entries;
4571static int __init set_thash_entries(char *str)
4572{
4573 ssize_t ret;
4574
4575 if (!str)
4576 return 0;
4577
4578 ret = kstrtoul(str, 0, &thash_entries);
4579 if (ret)
4580 return 0;
4581
4582 return 1;
4583}
4584__setup("thash_entries=", set_thash_entries);
4585
4586static void __init tcp_init_mem(void)
4587{
4588 unsigned long limit = nr_free_buffer_pages() / 16;
4589
4590 limit = max(limit, 128UL);
4591 sysctl_tcp_mem[0] = limit / 4 * 3;
4592 sysctl_tcp_mem[1] = limit;
4593 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
4594}
4595
4596void __init tcp_init(void)
4597{
4598 int max_rshare, max_wshare, cnt;
4599 unsigned long limit;
4600 unsigned int i;
4601
4602 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
4603 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
4604 sizeof_field(struct sk_buff, cb));
4605
4606 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
4607
4608 timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
4609 mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
4610
4611 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
4612 thash_entries, 21,
4613 0, 64 * 1024);
4614 tcp_hashinfo.bind_bucket_cachep =
4615 kmem_cache_create("tcp_bind_bucket",
4616 sizeof(struct inet_bind_bucket), 0,
4617 SLAB_HWCACHE_ALIGN | SLAB_PANIC |
4618 SLAB_ACCOUNT,
4619 NULL);

	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
4626 tcp_hashinfo.ehash =
4627 alloc_large_system_hash("TCP established",
4628 sizeof(struct inet_ehash_bucket),
4629 thash_entries,
4630 17,
4631 0,
4632 NULL,
4633 &tcp_hashinfo.ehash_mask,
4634 0,
4635 thash_entries ? 0 : 512 * 1024);
4636 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
4637 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4638
4639 if (inet_ehash_locks_alloc(&tcp_hashinfo))
4640 panic("TCP: failed to alloc ehash_locks");
4641 tcp_hashinfo.bhash =
4642 alloc_large_system_hash("TCP bind",
4643 sizeof(struct inet_bind_hashbucket),
4644 tcp_hashinfo.ehash_mask + 1,
4645 17,
4646 0,
4647 &tcp_hashinfo.bhash_size,
4648 NULL,
4649 0,
4650 64 * 1024);
4651 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4652 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4653 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4654 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4655 }
4656
4657
4658 cnt = tcp_hashinfo.ehash_mask + 1;
4659 sysctl_tcp_max_orphans = cnt / 2;
4660
4661 tcp_init_mem();
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
4663 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4664 max_wshare = min(4UL*1024*1024, limit);
4665 max_rshare = min(6UL*1024*1024, limit);
4666
4667 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
4668 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4669 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4670
4671 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
4672 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4673 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4674
4675 pr_info("Hash tables configured (established %u bind %u)\n",
4676 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4677
4678 tcp_v4_init();
4679 tcp_metrics_init();
4680 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4681 tcp_tasklet_init();
4682 mptcp_init();
4683}
4684